From f364fdcd44540b6d5403f1d08acbebfff4e78bd4 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 19 Jul 2024 14:56:13 -0400 Subject: [PATCH 001/270] DOC v24.10 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 +-- .devcontainer/cuda11.8-pip/devcontainer.json | 6 +-- .../cuda12.5-conda/devcontainer.json | 6 +-- .devcontainer/cuda12.5-pip/devcontainer.json | 6 +-- .github/workflows/build.yaml | 20 ++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 44 +++++++++---------- .github/workflows/test.yaml | 22 +++++----- README.md | 2 +- VERSION | 2 +- ci/test_wheel_cudf_polars.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 10 ++--- .../all_cuda-125_arch-x86_64.yaml | 10 ++--- cpp/examples/versions.cmake | 2 +- dependencies.yaml | 32 +++++++------- java/ci/README.md | 4 +- java/pom.xml | 2 +- python/cudf/pyproject.toml | 4 +- python/cudf_kafka/pyproject.toml | 2 +- python/cudf_polars/docs/overview.md | 2 +- python/cudf_polars/pyproject.toml | 2 +- python/custreamz/pyproject.toml | 4 +- python/dask_cudf/pyproject.toml | 6 +-- 23 files changed, 99 insertions(+), 99 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 8423fe21c29..7a1361e52c5 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 4945d6cf753..64d7cd54130 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index fadce01d060..c1924243506 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" } }, 
"runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 026eb540952..beab2940176 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2e5959338b0..2fc39c06fad 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: 
build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 5a937b2f362..cf0c2b377dd 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d5dfc9e1ff5..c2e7f64f952 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,41 +34,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: 
build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -76,14 +76,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -103,7 +103,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -113,7 +113,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -123,21 +123,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -157,7 +157,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -166,7 +166,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,7 +174,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -185,7 +185,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -194,7 +194,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request @@ -204,7 +204,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36c9088d93c..9feea050b19 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 1ab6a2d7457..fd8b0365807 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.5 + cudf=24.10 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index ec8489fda92..7c7ba04436f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.08.00 +24.10.00 diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 900acd5d473..cc9f5788685 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -10,7 +10,7 @@ set -eou pipefail # files in cudf_polars/pylibcudf", rather than "are there changes # between upstream and this branch which touch cudf_polars/pylibcudf" # TODO: is the target branch exposed anywhere in an environment variable? -if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; then HAS_CHANGES=1 else diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b8d73a01f96..b1a1cc3c68e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -43,10 +43,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.8.*,>=0.0.0a0 +- libkvikio==24.10.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.*,>=0.0.0a0 +- librmm==24.10.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -77,9 +77,9 @@ dependencies: - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - rich -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 3f5fae49cbb..1017b11779c 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,10 +42,10 @@ dependencies: - libarrow==16.1.0.* - libcufile-dev - libcurand-dev -- libkvikio==24.8.*,>=0.0.0a0 +- libkvikio==24.10.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.*,>=0.0.0a0 +- 
librmm==24.10.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -75,9 +75,9 @@ dependencies: - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - rich -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 144b3d3721b..44493011673 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License. # ============================================================================= -set(CUDF_TAG branch-24.08) +set(CUDF_TAG branch-24.10) diff --git a/dependencies.yaml b/dependencies.yaml index a19574b7658..a90ac64387b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -287,8 +287,8 @@ dependencies: - output_types: conda packages: - fmt>=10.1.1,<11 - - librmm==24.8.*,>=0.0.0a0 - - libkvikio==24.8.*,>=0.0.0a0 + - librmm==24.10.*,>=0.0.0a0 + - libkvikio==24.10.*,>=0.0.0a0 - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -329,7 +329,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.8.*,>=0.0.0a0 + - &rmm_conda rmm==24.10.*,>=0.0.0a0 - pip - pip: - git+https://github.com/python-streamz/streamz.git@master @@ -345,10 +345,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &build_python_packages_cu12 - - rmm-cu12==24.8.*,>=0.0.0a0 + - rmm-cu12==24.10.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: &build_python_packages_cu11 - - rmm-cu11==24.8.*,>=0.0.0a0 + - rmm-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*rmm_conda] } libarrow_build: common: @@ -505,7 +505,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.8.*,>=0.0.0a0 + - dask-cuda==24.10.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -597,11 +597,11 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.8.*,>=0.0.0a0 + - rmm-cu12==24.10.*,>=0.0.0a0 - pynvjitlink-cu12>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.8.*,>=0.0.0a0 + - rmm-cu11==24.10.*,>=0.0.0a0 - cubinlinker-cu11 - ptxcompiler-cu11 - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]} @@ -614,7 +614,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.8.*,>=0.0.0a0 + - rapids-dask-dependency==24.10.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -700,13 +700,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.8.*,>=0.0.0a0 + - dask-cuda==24.10.*,>=0.0.0a0 - *numba depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.8.*,>=0.0.0a0 + - &cudf_conda cudf==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -718,16 +718,16 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf-cu12==24.8.*,>=0.0.0a0 + - cudf-cu12==24.10.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cudf-cu11==24.8.*,>=0.0.0a0 + - cudf-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_conda]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_conda cudf_kafka==24.8.*,>=0.0.0a0 + - &cudf_kafka_conda cudf_kafka==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -739,10 +739,10 @@ 
dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf_kafka-cu12==24.8.*,>=0.0.0a0 + - cudf_kafka-cu12==24.10.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cudf_kafka-cu11==24.8.*,>=0.0.0a0 + - cudf_kafka-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_conda]} depends_on_cupy: common: diff --git a/java/ci/README.md b/java/ci/README.md index 49481efab6b..ccb9efb50b6 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.08 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.10 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-24.08.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-24.10.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 70230e6bc71..9694e741f16 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT cudfjni diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index dcb33b1fc1a..da57622dec7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "ptxcompiler", "pyarrow>=16.1.0,<16.2.0a0", "rich", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -126,7 +126,7 @@ requires = [ "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index badfdf06d15..bff1a9b8493 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 874bb849747..6cd36136bf8 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,7 +8,7 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-24.10/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. > ![NOTE] These instructions will get simpler as we merge code in. 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 0b559f7a8e9..393a7510c89 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "polars>=1.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 7b99e041b54..59ce15ac4ef 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -20,8 +20,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==24.8.*,>=0.0.0a0", - "cudf_kafka==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", + "cudf_kafka==24.10.*,>=0.0.0a0", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 9b2e3a5a7b1..4968ff0b076 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -19,12 +19,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<2.0a0", "pandas>=2.0,<2.2.3dev0", - "rapids-dask-dependency==24.8.*,>=0.0.0a0", + "rapids-dask-dependency==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -45,7 +45,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.8.*,>=0.0.0a0", + "dask-cuda==24.10.*,>=0.0.0a0", "numba>=0.57", "pytest-cov", "pytest-xdist", From 29ce5c529ea9ea18edc32ab905f1ef076f266008 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 25 Jul 2024 01:29:41 +0200 Subject: [PATCH 002/270] Fix some issues with deprecated / removed cccl facilities (#16377) `cub::If` has been deprecated and should not be used. 
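There is a better alternative in `cuda::std::conditional_t`. As a condensed before/after sketch of both kinds of change in this patch (`clamp_down` is adapted from the `generate_input.cu` hunk below, while the `PathT` alias and its parameters are illustrative stand-ins for the project-specific types in `agent_dfa.cuh`):

```cpp
#include <cuda/std/type_traits>

// Before: the deprecated cub::If type selector.
//   using PathT = typename cub::If<UseSmem, SmemPathT, GmemPathT>::Type;
// After: the libcu++ standard alias expresses the same compile-time selection.
template <bool UseSmem, typename SmemPathT, typename GmemPathT>
using PathT = cuda::std::conditional_t<UseSmem, SmemPathT, GmemPathT>;

// Before: functors derived from the deprecated thrust::unary_function<T, T>.
//   template <typename T>
//   struct clamp_down : public thrust::unary_function<T, T> { ... };
// After: a plain functor; argument/result types are deduced via the
// std::invoke machinery, so no base class is needed.
template <typename T>
struct clamp_down {
  T max;
  __host__ __device__ T operator()(T x) const { return x < max ? x : max; }
};
```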
`thrust::{binary, unary}_function` has also been deprecated and, like the removed `std::{binary, unary}_function`, no longer serves a purpose. Rather than relying on those type aliases, one should use the `std::invoke` machinery.

Authors:
  - Michael Schellenberger Costa (https://github.com/miscco)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Bernhard Manfred Gruber (https://github.com/bernhardmgruber)

URL: https://github.com/rapidsai/cudf/pull/16377
---
 cpp/benchmarks/common/generate_input.cu          | 2 +-
 cpp/include/cudf/detail/gather.cuh               | 2 +-
 cpp/src/io/fst/agent_dfa.cuh                     | 2 +-
 cpp/src/reductions/minmax.cu                     | 3 +--
 java/src/main/native/src/aggregation128_utils.cu | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index 6df2cb44adc..0970003deb2 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -718,7 +718,7 @@ std::unique_ptr<cudf::column> create_random_column(data_profi
 }
 
 template <typename T>
-struct clamp_down : public thrust::unary_function<T, T> {
+struct clamp_down {
   T max;
   clamp_down(T max) : max(max) {}
   __host__ __device__ T operator()(T x) const { return min(x, max); }
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index d3e9fc4974d..e8e95380815 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -518,7 +518,7 @@ struct column_gatherer_impl {
  * Positive indices are unchanged by this transformation.
  */
 template <typename map_type>
-struct index_converter : public thrust::unary_function<map_type, map_type> {
+struct index_converter {
   index_converter(size_type n_rows) : n_rows(n_rows) {}
   __device__ map_type operator()(map_type in) const { return ((in % n_rows) + n_rows) % n_rows; }
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index bc5b94e2718..0e70984b39c 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -791,7 +791,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
       can_use_smem_cache;
     using DFASimulationCallbackWrapperT =
-      typename cub::If::Type;
+      cuda::std::conditional_t;
 
     // Stage 1: Compute the state-transition vector
     if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) {
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 2c1181972c5..6cb58786971 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -107,8 +107,7 @@ rmm::device_scalar reduce_device(InputIterator d_in,
  * respectively of the minimums and maximums of the input pairs.
  */
 template <typename T>
-struct minmax_binary_op
-  : public thrust::binary_function<minmax_pair<T>, minmax_pair<T>, minmax_pair<T>> {
+struct minmax_binary_op {
   __device__ minmax_pair<T> operator()(minmax_pair<T> const& lhs, minmax_pair<T> const& rhs) const
   {
     return minmax_pair<T>{thrust::min(lhs.min_val, rhs.min_val),
diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu
index a32e7d27085..631df58b017 100644
--- a/java/src/main/native/src/aggregation128_utils.cu
+++ b/java/src/main/native/src/aggregation128_utils.cu
@@ -34,7 +34,7 @@ namespace {
 // Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection.
-class chunk_assembler : public thrust::unary_function { +class chunk_assembler { public: chunk_assembler(bool* overflows, uint64_t const* chunks0, From 5a3399bec868f44d13c003f172c665919096d8e8 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 24 Jul 2024 19:26:12 -0500 Subject: [PATCH 003/270] fix [tool.setuptools] reference in custreamz config (#16365) Noticed this warning in logs from #16183 > _/python3.10/site-packages/setuptools/config/pyprojecttoml.py:70: _ToolsTypoInMetadata: Ignoring [tools.setuptools] in pyproject.toml, did you mean [tool.setuptools]?_ This fixes that. ## Notes for Reviewers Intentionally targeting this at 24.10. This misconfiguration has been in `custreamz` since the 23.04 release ([git blame link](https://github.com/rapidsai/cudf/blame/e6d412cba7c23df7ee500c28257ed9281cea49b9/python/custreamz/pyproject.toml#L60)). I think the only effect might be that some test files are included in wheels when we don't want to. I don't think the fix for it needs to be rushed into 24.08. I searched across RAPIDS in case this was copied from somewhere else... don't see any other instances of this typo that need to be fixed. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16365 --- python/custreamz/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 59ce15ac4ef..4be94aa3368 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -57,7 +57,7 @@ zip-safe = false [tool.setuptools.dynamic] version = {file = "custreamz/VERSION"} -[tools.setuptools.packages.find] +[tool.setuptools.packages.find] include = [ "custreamz", "custreamz.*", From 473dec55abd1a3d9d540c541443f831d18ebb532 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Fri, 26 Jul 2024 14:45:12 -0700 Subject: [PATCH 004/270] Add query 10 to the TPC-H suite (#16392) Adds Q10 to the TPC-H benchmark suite Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16392 --- cpp/examples/tpch/CMakeLists.txt | 4 + cpp/examples/tpch/q1.cpp | 2 +- cpp/examples/tpch/q10.cpp | 166 +++++++++++++++++++++++++++++++ cpp/examples/tpch/q5.cpp | 20 ++-- cpp/examples/tpch/q6.cpp | 2 +- 5 files changed, 182 insertions(+), 12 deletions(-) create mode 100644 cpp/examples/tpch/q10.cpp diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt index 1b91d07e148..373a6d72d56 100644 --- a/cpp/examples/tpch/CMakeLists.txt +++ b/cpp/examples/tpch/CMakeLists.txt @@ -30,3 +30,7 @@ target_compile_features(tpch_q6 PRIVATE cxx_std_17) add_executable(tpch_q9 q9.cpp) target_link_libraries(tpch_q9 PRIVATE cudf::cudf) target_compile_features(tpch_q9 PRIVATE cxx_std_17) + +add_executable(tpch_q10 q10.cpp) +target_link_libraries(tpch_q10 PRIVATE cudf::cudf) +target_compile_features(tpch_q10 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp index 1bdf039da4a..fe03320b888 100644 --- a/cpp/examples/tpch/q1.cpp +++ b/cpp/examples/tpch/q1.cpp @@ -124,7 +124,7 @@ int main(int argc, char const** argv) auto shipdate_upper = cudf::timestamp_scalar(days_since_epoch(1998, 9, 2), true); auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); - auto lineitem_pred = std::make_unique( + auto const 
lineitem_pred = std::make_unique( cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file diff --git a/cpp/examples/tpch/q10.cpp b/cpp/examples/tpch/q10.cpp new file mode 100644 index 00000000000..94da46f6930 --- /dev/null +++ b/cpp/examples/tpch/q10.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q10.cpp + * @brief Implement query 10 of the TPC-H benchmark. + * + * create view customer as select * from '/tables/scale-1/customer.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * + * select + * c_custkey, + * c_name, + * sum(l_extendedprice * (1 - l_discount)) as revenue, + * c_acctbal, + * n_name, + * c_address, + * c_phone, + * c_comment + * from + * customer, + * orders, + * lineitem, + * nation + * where + * c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and o_orderdate >= date '1993-10-01' + * and o_orderdate < date '1994-01-01' + * and l_returnflag = 'R' + * and c_nationkey = n_nationkey + * group by + * c_custkey, + * c_name, + * c_acctbal, + * c_phone, + * n_name, + * c_address, + * c_comment + * order by + * revenue desc; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + revenue_type, + stream, + mr); + return revenue; +} +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projection and filter predicate for the `orders` table + std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; + auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( + orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate"))); + auto o_orderdate_lower = + cudf::timestamp_scalar(days_since_epoch(1993, 10, 1), true); + auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower); + auto const o_orderdate_pred_lower = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit); + auto o_orderdate_upper = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); + auto const o_orderdate_pred_upper = + cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); + auto const orders_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); + + auto const l_returnflag_ref = cudf::ast::column_reference(3); + auto r_scalar = cudf::string_scalar("R"); + auto const r_literal = cudf::ast::literal(r_scalar); + auto const lineitem_pred = std::make_unique( + cudf::ast::ast_operator::EQUAL, l_returnflag_ref, r_literal); + + // Read out the tables from parquet files + // while pushing down the column projections and filter predicates + auto const customer = read_parquet( + args.dataset_dir + "/customer.parquet", + {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + auto const lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", + {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, + std::move(lineitem_pred)); + auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"}); + + // Perform the joins + auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); + auto const join_b = apply_inner_join(lineitem, orders, {"l_orderkey"}, {"o_orderkey"}); + auto const joined_table = apply_inner_join(join_a, join_b, {"c_custkey"}, {"o_custkey"}); + + // Calculate and append the `revenue` column + auto revenue = + calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + (*joined_table).append(revenue, "revenue"); + + // Perform the groupby operation + auto const groupedby_table = 
apply_groupby( + joined_table, + groupby_context_t{ + {"c_custkey", "c_name", "c_acctbal", "c_phone", "n_name", "c_address", "c_comment"}, + { + {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = + apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q10.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp index e56850b94d6..89396a6c968 100644 --- a/cpp/examples/tpch/q5.cpp +++ b/cpp/examples/tpch/q5.cpp @@ -44,14 +44,14 @@ * region * where * c_custkey = o_custkey - * and l_orderkey = o_orderkey - * and l_suppkey = s_suppkey - * and c_nationkey = s_nationkey - * and s_nationkey = n_nationkey - * and n_regionkey = r_regionkey - * and r_name = 'ASIA' - * and o_orderdate >= date '1994-01-01' - * and o_orderdate < date '1995-01-01' + * and l_orderkey = o_orderkey + * and l_suppkey = s_suppkey + * and c_nationkey = s_nationkey + * and s_nationkey = n_nationkey + * and n_regionkey = r_regionkey + * and r_name = 'ASIA' + * and o_orderdate >= date '1994-01-01' + * and o_orderdate < date '1995-01-01' * group by * n_name * order by @@ -109,7 +109,7 @@ int main(int argc, char const** argv) auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); auto const o_orderdate_pred_upper = cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); - auto orders_pred = std::make_unique( + auto const orders_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); // Define the column projection and filter predicate for the `region` table @@ -118,7 +118,7 @@ int main(int argc, char const** argv) region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name"))); auto r_name_value = cudf::string_scalar("ASIA"); auto const r_name_literal = cudf::ast::literal(r_name_value); - auto region_pred = std::make_unique( + auto const region_pred = std::make_unique( cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal); // Read out the tables from parquet files diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp index f11b3d6ab3b..405b2ac73ca 100644 --- a/cpp/examples/tpch/q6.cpp +++ b/cpp/examples/tpch/q6.cpp @@ -84,7 +84,7 @@ int main(int argc, char const** argv) cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal); auto const shipdate_pred_b = cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); - auto lineitem_pred = std::make_unique( + auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); auto lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); From 24997fda194d5b8af34048a8bf275830cabbff8c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:37:30 -0700 Subject: [PATCH 005/270] Deduplicate decimal32/decimal64 to decimal128 conversion function (#16236) Closes #16194 This PR deduplicates the `convert_data_to_decimal128` function from `to_arrow.cu`, `writer_impl.cu` and `to_arrow_device.cu` to a common location. 
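After this change, all three call sites funnel through one helper in `cudf::detail`. A minimal sketch of the consolidated call pattern, matching the signature introduced in `decimal_conversion_utilities.cuh` below (the wrapper `widen_to_decimal128` is hypothetical, for illustration only):

```cpp
#include "interop/decimal_conversion_utilities.cuh"

#include <cudf/column/column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>

// Hypothetical call site: widen a decimal32 column's representation to the
// two's-complement decimal128 layout expected by Arrow and Parquet.
std::unique_ptr<rmm::device_buffer> widen_to_decimal128(
  cudf::column_view const& col,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  // int32_t selects the decimal32 instantiation; use int64_t for decimal64.
  auto buf = cudf::detail::convert_decimals_to_decimal128<int32_t>(col, stream, mr);
  // The conversion kernel is launched asynchronously on `stream`; consumers
  // reading the buffer on the host or another stream must synchronize first,
  // as the updated call sites in the diff below do.
  stream.synchronize();
  return buf;
}
```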
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16236 --- cpp/CMakeLists.txt | 1 + .../interop/decimal_conversion_utilities.cu | 70 +++++++++++++++++ .../interop/decimal_conversion_utilities.cuh | 44 +++++++++++ cpp/src/interop/to_arrow.cu | 8 +- cpp/src/interop/to_arrow_device.cu | 5 +- cpp/src/interop/to_arrow_host.cu | 40 +--------- cpp/src/io/parquet/writer_impl.cu | 60 ++++----------- cpp/tests/interop/to_arrow_device_test.cpp | 77 +++++++++++++++++++ 8 files changed, 220 insertions(+), 85 deletions(-) create mode 100644 cpp/src/interop/decimal_conversion_utilities.cu create mode 100644 cpp/src/interop/decimal_conversion_utilities.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 95c509efc5b..310bc99b279 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -365,6 +365,7 @@ add_library( src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/arrow_utilities.cpp + src/interop/decimal_conversion_utilities.cu src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/to_arrow_host.cu diff --git a/cpp/src/interop/decimal_conversion_utilities.cu b/cpp/src/interop/decimal_conversion_utilities.cu new file mode 100644 index 00000000000..2f81c754a30 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cu @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "decimal_conversion_utilities.cuh" + +#include +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + static_assert(std::is_same_v or std::is_same_v, + "Only int32 and int64 decimal types can be converted to decimal128."); + + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + auto buf = std::make_unique(column.size() * sizeof(__int128_t), stream, mr); + + thrust::for_each(rmm::exec_policy_nosync(stream, mr), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(buf->data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; + } + }); + + return buf; +} + +// Instantiate templates for int32_t and int64_t decimal types +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/decimal_conversion_utilities.cuh b/cpp/src/interop/decimal_conversion_utilities.cuh new file mode 100644 index 00000000000..41263147404 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +namespace cudf::detail { + +/** + * @brief Convert decimal32 and decimal64 numeric data to decimal128 and return the device vector + * + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return A device vector containing the converted decimal128 data + */ +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace cudf::detail diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 6b163e3441e..3d41f856f4f 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include "detail/arrow_allocator.hpp" #include @@ -158,8 +159,11 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto buf = - detail::decimals_to_arrow(input, stream, rmm::mr::get_current_device_resource()); + auto buf = detail::convert_decimals_to_decimal128( + input, stream, rmm::mr::get_current_device_resource()); + + // Synchronize stream here to ensure the decimal128 buffer is ready. 
+ stream.synchronize(); auto const buf_size_in_bytes = buf->size(); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index 2eb9b912054..cea7cdebcba 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include #include @@ -141,7 +142,9 @@ int construct_decimals(cudf::column_view input, nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); - auto buf = detail::decimals_to_arrow(input, stream, mr); + auto buf = detail::convert_decimals_to_decimal128(input, stream, mr); + // Synchronize stream here to ensure the decimal128 buffer is ready. + stream.synchronize(); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); ArrowArrayMove(tmp.get(), out); diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index c9e53ebaab7..193b3a3b5a2 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include #include @@ -50,41 +51,6 @@ namespace cudf { namespace detail { -template -std::unique_ptr decimals_to_arrow(cudf::column_view input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - auto buf = std::make_unique(input.size() * sizeof(__int128_t), stream, mr); - - auto count = thrust::counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream, mr), - count, - count + input.size(), - [in = input.begin(), - out = reinterpret_cast(buf->data()), - BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // the lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); - - return buf; -} - -template std::unique_ptr decimals_to_arrow( - cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - -template std::unique_ptr decimals_to_arrow( - cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - namespace { struct dispatch_to_arrow_host { @@ -156,7 +122,9 @@ struct dispatch_to_arrow_host { NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column)); NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); - auto buf = detail::decimals_to_arrow(column, stream, mr); + auto buf = detail::convert_decimals_to_decimal128(column, stream, mr); + // No need to synchronize stream here as populate_data_buffer uses the same stream to copy data + // to host. 
NANOARROW_RETURN_NOT_OK( populate_data_buffer(device_span<__int128_t const>( reinterpret_cast(buf->data()), column.size()), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2df71b77301..36a1d8377bf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -22,6 +22,7 @@ #include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" +#include "interop/decimal_conversion_utilities.cuh" #include "io/comp/nvcomp_adapter.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" @@ -1601,50 +1602,12 @@ size_t column_index_buffer_size(EncColumnChunk* ck, return ck->ck_stat_size * num_pages + column_index_truncate_length + padding + size_struct_size; } -/** - * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector - * - * @tparam DecimalType to convert from - * - * @param column A view of the input columns - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return A device vector containing the converted decimal128 data - */ -template -rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, - rmm::cuda_stream_view stream) -{ - size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); - - rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); - - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(column.size()), - [in = column.begin(), - out = reinterpret_cast(d128_buffer.data()), - BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); - - return d128_buffer; -} - /** * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, * update the input table metadata, and return a new vector of column views. * * @param[in,out] table_meta The table metadata - * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param[in,out] d128_buffers Buffers containing the converted decimal128 data. 
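+ *                         Each buffer owns the decimal128 copy of one converted
+ *                         column and must outlive the returned column views.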
* @param input The input table * @param stream CUDA stream used for device memory operations and kernel launches * @@ -1652,7 +1615,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co */ std::vector convert_decimal_columns_and_metadata( table_input_metadata& table_meta, - std::vector>& d128_vectors, + std::vector>& d128_buffers, table_view const& table, rmm::cuda_stream_view stream) { @@ -1673,28 +1636,30 @@ std::vector convert_decimal_columns_and_metadata( switch (column.type().id()) { case type_id::DECIMAL32: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), converted_children}; case type_id::DECIMAL64: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), @@ -1722,6 +1687,9 @@ std::vector convert_decimal_columns_and_metadata( std::back_inserter(converted_column_views), [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + // Synchronize stream here to ensure all decimal128 buffers are ready. + stream.synchronize(); + return converted_column_views; } @@ -1780,13 +1748,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, rmm::cuda_stream_view stream) { // Container to store decimal128 converted data if needed - std::vector> d128_vectors; + std::vector> d128_buffers; // Convert decimal32/decimal64 data to decimal128 if writing arrow schema // and initialize LinkedColVector auto vec = table_to_linked_columns( (write_arrow_schema) - ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) + ? 
table_view({convert_decimal_columns_and_metadata(table_meta, d128_buffers, input, stream)}) : input); auto schema_tree = construct_parquet_schema_tree( diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 77da4039103..51216a8512c 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -710,6 +710,83 @@ TEST_F(ToArrowDeviceTest, StructColumn) template using fp_wrapper = cudf::test::fixed_point_column_wrapper; +TEST_F(ToArrowDeviceTest, FixedPoint32Table) +{ + using namespace numeric; + + for (auto const scale : {6, 4, 2, 0, -1, -3, -5}) { + auto const expect_data = + std::vector{-1000, -1, -1, -1, 2400, 0, 0, 0, -3456, -1, -1, -1, + 4650, 0, 0, 0, 5154, 0, 0, 0, 6800, 0, 0, 0}; + auto col = fp_wrapper({-1000, 2400, -3456, 4650, 5154, 6800}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + + auto result_dev_data = std::make_unique>( + expect_data.size(), cudf::get_default_stream()); + cudaMemcpy(result_dev_data->data(), + expect_data.data(), + sizeof(int32_t) * expect_data.size(), + cudaMemcpyHostToDevice); + + cudf::get_default_stream().synchronize(); + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + expected_array->children[0]->length = input.num_rows(); + NANOARROW_THROW_NOT_OK( + ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc)); + ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data = + const_cast(reinterpret_cast(input.view().column(0).null_mask())); + + auto data_ptr = reinterpret_cast(result_dev_data->data()); + NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator( + ArrowArrayBuffer(expected_array->children[0], 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = + reinterpret_cast>*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr>(std::move(result_dev_data))))); + ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr; + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + auto got_arrow_array = cudf::to_arrow_device(input.view()); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + + got_arrow_array = cudf::to_arrow_device(std::move(input)); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, 
got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + } +} + TEST_F(ToArrowDeviceTest, FixedPoint64Table) { using namespace numeric; From 18c1465b597284d8b558964cc0ca48de7da60a17 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 06:06:07 -1000 Subject: [PATCH 006/270] Align ewm APIs with pandas 2.x (#16413) These all currently are not implemented and raise a `NotImplementedError` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16413 --- python/cudf/cudf/core/window/ewm.py | 52 ++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index bb153d4b549..1203a840076 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -114,23 +114,57 @@ def __init__( self.adjust = adjust self.com = get_center_of_mass(com, span, halflife, alpha) - def mean(self): + def online(self, engine: str = "numba", engine_kwargs=None): + """ + Return an ``OnlineExponentialMovingWindow`` object to calculate + exponentially moving window aggregations in an online method. + + Currently not supported. + """ + raise NotImplementedError("online is currently not supported.") + + def mean( + self, numeric_only: bool = False, engine=None, engine_kwargs=None + ): """ Calculate the ewm (exponential weighted moment) mean. """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas." + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional and added for compatibility with pandas." 
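+                # pandas uses `engine`/`engine_kwargs` to select its numba
+                # execution path; cudf has no equivalent, so both keywords
+                # exist purely for signature parity and are rejected when set.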
+ ) return self._apply_agg("ewma") - def var(self, bias): - raise NotImplementedError("ewmvar not yet supported.") + def sum(self, numeric_only: bool = False, engine=None, engine_kwargs=None): + raise NotImplementedError("sum not yet supported.") - def std(self, bias): - raise NotImplementedError("ewmstd not yet supported.") + def var(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("var not yet supported.") - def corr(self, other): - raise NotImplementedError("ewmcorr not yet supported.") + def std(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("std not yet supported.") - def cov(self, other): - raise NotImplementedError("ewmcov not yet supported.") + def corr( + self, other, pairwise: bool | None = None, numeric_only: bool = False + ): + raise NotImplementedError("corr not yet supported.") + + def cov( + self, + other, + pairwise: bool | None = None, + bias: bool = False, + numeric_only: bool = False, + ): + raise NotImplementedError("cov not yet supported.") def _apply_agg_series(self, sr, agg_name): if not is_numeric_dtype(sr.dtype): From 58f47242fe04b1e25fd42e1e45e8c15417140777 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 06:09:21 -1000 Subject: [PATCH 007/270] Align groupby APIs with pandas 2.x (#16403) The following breaking APIs are affected: * `apply` * `transform` * `describe` The rest of the APIs are non-breaking and generally will raise a `NotImplementedError` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16403 --- .../source/user_guide/api_docs/groupby.rst | 3 +- python/cudf/cudf/core/groupby/groupby.py | 629 ++++++++++++++---- python/cudf/cudf/core/resample.py | 6 +- python/cudf/cudf/tests/test_groupby.py | 25 + 4 files changed, 514 insertions(+), 149 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst index 80811efa33f..ca29087cbf9 100644 --- a/docs/cudf/source/user_guide/api_docs/groupby.rst +++ b/docs/cudf/source/user_guide/api_docs/groupby.rst @@ -68,7 +68,6 @@ Computations / descriptive stats GroupBy.std GroupBy.sum GroupBy.var - GroupBy.corr GroupBy.cov The following methods are available in both ``SeriesGroupBy`` and @@ -81,6 +80,7 @@ application to columns of a specific data type. :toctree: api/ DataFrameGroupBy.bfill + DataFrameGroupBy.corr DataFrameGroupBy.count DataFrameGroupBy.cumcount DataFrameGroupBy.cummax @@ -102,5 +102,6 @@ The following methods are available only for ``SeriesGroupBy`` objects. .. 
autosummary:: :toctree: api/ + SeriesGroupBy.corr SeriesGroupBy.nunique SeriesGroupBy.unique diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1646c5042fd..3cfbd1d736a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable +from typing import TYPE_CHECKING, Any, Iterable, Literal import cupy as cp import numpy as np @@ -306,6 +306,18 @@ def __iter__(self): grouped_values[offsets[i] : offsets[i + 1]], ) + def __len__(self) -> int: + return self.ngroups + + @property + def ngroups(self) -> int: + _, offsets, _, _ = self._grouped() + return len(offsets) - 1 + + @property + def ndim(self) -> int: + return self.obj.ndim + @property def dtypes(self): """ @@ -457,10 +469,20 @@ def size(self): ) @_performance_tracking - def cumcount(self): + def cumcount(self, ascending: bool = True): """ Return the cumulative count of keys in each group. + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + Currently not supported """ + if ascending is not True: + raise NotImplementedError( + "ascending is currently not implemented." + ) return ( cudf.Series( cudf.core.column.column_empty( @@ -527,7 +549,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func): + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -615,6 +637,22 @@ def agg(self, func): 1 1.5 1.75 2.0 2.0 2 3.0 3.00 1.0 1.0 """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) column_names, columns, normalized_aggs = self._normalize_aggs(func) orig_dtypes = tuple(c.dtype for c in columns) @@ -935,12 +973,13 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): ) @_performance_tracking - def nth(self, n): + def nth(self, n, dropna: Literal["any", "all", None] = None): """ Return the nth row from each group. """ - - self.obj["__groupbynth_order__"] = range(0, len(self.obj)) + if dropna is not None: + raise NotImplementedError("dropna is not currently supported.") + self.obj["__groupbynth_order__"] = range(0, len(self.obj)) # type: ignore[index] # We perform another groupby here to have the grouping columns # be a part of dataframe columns. result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n)) @@ -1423,13 +1462,13 @@ def _post_process_chunk_results( @_performance_tracking def apply( - self, function, *args, engine="auto", include_groups: bool = True + self, func, *args, engine="auto", include_groups: bool = True, **kwargs ): """Apply a python transformation function over the grouped chunk. Parameters ---------- - function : callable + func : callable The python transformation function that will be applied on the grouped chunk. args : tuple @@ -1452,6 +1491,9 @@ def apply( When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. In the future, this will default to ``False``. 
+ kwargs : dict + Optional keyword arguments to pass to the function. + Currently not supported Examples -------- @@ -1528,13 +1570,17 @@ def mult(df): dtype: int64 """ + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) if self.obj.empty: - if function in {"count", "size", "idxmin", "idxmax"}: + if func in {"count", "size", "idxmin", "idxmax"}: res = cudf.Series([], dtype="int64") else: res = self.obj.copy(deep=True) res.index = self.grouping.keys - if function in {"sum", "product"}: + if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. for name, col in res._data.items(): @@ -1542,20 +1588,20 @@ def mult(df): res._data[name] = col.astype("int") return res - if not callable(function): - raise TypeError(f"type {type(function)} is not callable") + if not callable(func): + raise TypeError(f"type {type(func)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped( include_groups=include_groups ) if engine == "auto": - if _can_be_jitted(grouped_values, function, args): + if _can_be_jitted(grouped_values, func, args): engine = "jit" else: engine = "cudf" if engine == "jit": result = self._jit_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1564,7 +1610,7 @@ def mult(df): ) elif engine == "cudf": result = self._iterative_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1744,12 +1790,14 @@ def _broadcast(self, values: cudf.Series) -> cudf.Series: return values @_performance_tracking - def transform(self, function): + def transform( + self, func, *args, engine=None, engine_kwargs=None, **kwargs + ): """Apply an aggregation, then broadcast the result to the group size. Parameters ---------- - function: str or callable + func: str or callable Aggregation to apply to each group. Note that the set of operations currently supported by `transform` is identical to that supported by the `agg` method. @@ -1778,18 +1826,35 @@ def transform(self, function): -------- agg """ - if not (isinstance(function, str) or callable(function)): + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) + + if not (isinstance(func, str) or callable(func)): raise TypeError( "Aggregation must be a named aggregation or a callable" ) try: - result = self.agg(function) + result = self.agg(func) except TypeError as e: raise NotImplementedError( "Currently, `transform()` supports only aggregations." ) from e # If the aggregation is a scan, don't broadcast - if libgroupby._is_all_scan_aggregate([[function]]): + if libgroupby._is_all_scan_aggregate([[func]]): if len(result) != len(self.obj): raise AssertionError( "Unexpected result length for scan transform" @@ -1824,7 +1889,7 @@ def func(x): return self.agg(func) @_performance_tracking - def describe(self, include=None, exclude=None): + def describe(self, percentiles=None, include=None, exclude=None): """ Generate descriptive statistics that summarizes the central tendency, dispersion and shape of a dataset's distribution, excluding NaN values. 
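As a minimal sketch of how the reworked `describe` signature behaves (the
frame below is made up, not part of this diff):

```python
import cudf

df = cudf.DataFrame({"a": [1, 1, 2], "b": [0.5, 0.25, 0.125]})
gb = df.groupby("a")

# The default call matches pandas and computes count/mean/std/min/quantiles/max.
print(gb.describe())

# pandas-only keywords are validated eagerly instead of being silently ignored.
try:
    gb.describe(percentiles=[0.1, 0.9])
except NotImplementedError as err:
    print(err)  # percentiles is currently not supported
```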
@@ -1833,6 +1898,10 @@ def describe(self, include=None, exclude=None): Parameters ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. + Currently not supported. + include: 'all', list-like of dtypes or None (default), optional list of data types to include in the result. Ignored for Series. @@ -1869,8 +1938,12 @@ def describe(self, include=None, exclude=None): 90 1 24.0 24.0 24.0 24.0 24.0 24.0 """ - if exclude is not None and include is not None: - raise NotImplementedError + if percentiles is not None: + raise NotImplementedError("percentiles is currently not supported") + if exclude is not None: + raise NotImplementedError("exclude is currently not supported") + if include is not None: + raise NotImplementedError("include is currently not supported") res = self.agg( [ @@ -1896,69 +1969,7 @@ def describe(self, include=None, exclude=None): return res @_performance_tracking - def corr(self, method="pearson", min_periods=1): - """ - Compute pairwise correlation of columns, excluding NA/null values. - - Parameters - ---------- - method: {"pearson", "kendall", "spearman"} or callable, - default "pearson". Currently only the pearson correlation - coefficient is supported. - - min_periods: int, optional - Minimum number of observations required per pair of columns - to have a valid result. - - Returns - ------- - DataFrame - Correlation matrix. - - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({ - ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").corr(method="pearson") - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 1.000000 0.385727 0.385727 - val2 0.385727 1.000000 1.000000 - val3 0.385727 1.000000 1.000000 - c val1 1.000000 0.714575 0.714575 - val2 0.714575 1.000000 1.000000 - val3 0.714575 1.000000 1.000000 - """ - - if method.lower() not in ("pearson",): - raise NotImplementedError( - "Only pearson correlation is currently supported" - ) - - return self._cov_or_corr( - lambda x: x.corr(method, min_periods), "Correlation" - ) - - @_performance_tracking - def cov(self, min_periods=0, ddof=1): + def cov(self, min_periods=0, ddof=1, numeric_only: bool = False): """ Compute the pairwise covariance among the columns of a DataFrame, excluding NA/null values. @@ -2042,6 +2053,10 @@ def cov(self, min_periods=0, ddof=1): val2 3.833333 12.333333 12.333333 val3 3.833333 12.333333 12.333333 """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) return self._cov_or_corr( lambda x: x.cov(min_periods, ddof), "Covariance" @@ -2137,7 +2152,13 @@ def _cov_or_corr(self, func, method_name): return res @_performance_tracking - def var(self, ddof=1): + def var( + self, + ddof=1, + engine=None, + engine_kwargs=None, + numeric_only: bool = False, + ): """Compute the column-wise variance of the values in each group. Parameters @@ -2146,6 +2167,18 @@ def var(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the variance. 
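+        engine : str, default None
+            Accepted for compatibility with pandas; only the default of
+            ``None`` is currently supported.
+        engine_kwargs : dict, default None
+            Accepted for compatibility with pandas; only the default of
+            ``None`` is currently supported.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+            Currently not supported.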
""" + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) def func(x): return getattr(x, "var")(ddof=ddof) @@ -2153,7 +2186,13 @@ def func(x): return self.agg(func) @_performance_tracking - def std(self, ddof=1): + def std( + self, + ddof=1, + engine=None, + engine_kwargs=None, + numeric_only: bool = False, + ): """Compute the column-wise std of the values in each group. Parameters @@ -2162,6 +2201,18 @@ def std(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the standard deviation. """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) def func(x): return getattr(x, "std")(ddof=ddof) @@ -2169,7 +2220,9 @@ def func(x): return self.agg(func) @_performance_tracking - def quantile(self, q=0.5, interpolation="linear"): + def quantile( + self, q=0.5, interpolation="linear", numeric_only: bool = False + ): """Compute the column-wise quantiles of the values in each group. Parameters @@ -2179,7 +2232,14 @@ def quantile(self, q=0.5, interpolation="linear"): interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} The interpolation method to use when the desired quantile lies between two data points. Defaults to "linear". + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + Currently not supported """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is not currently supported." + ) def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) @@ -2333,7 +2393,14 @@ def fillna( ) @_performance_tracking - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self, + periods=1, + freq=None, + axis=0, + fill_value=None, + suffix: str | None = None, + ): """ Shift each group by ``periods`` positions. @@ -2355,6 +2422,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): the list. The length of the list should match the number of columns shifted. Each value should match the data type of the column to fill. + suffix : str, optional + A string to add to each shifted column if there are multiple periods. + Ignored otherwise. + Currently not supported. Returns ------- @@ -2374,6 +2445,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") + if suffix is not None: + raise NotImplementedError("shift is not currently supported.") + values = self.grouping.values if is_list_like(fill_value): if len(fill_value) != len(values._data): @@ -2473,6 +2547,142 @@ def pct_change( shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 + def _mimic_pandas_order( + self, result: DataFrameOrSeries + ) -> DataFrameOrSeries: + """Given a groupby result from libcudf, reconstruct the row orders + matching that of pandas. This also adds appropriate indices. 
+ """ + # TODO: copy metadata after this method is a common pattern, should + # merge in this method. + + # This function is used to reorder the results of scan-based + # groupbys which have the same output size as input size. + # However, if the grouping key has NAs and dropna=True, the + # result coming back from libcudf has null_count few rows than + # the input, so we must produce an ordering from the full + # input range. + _, _, (ordering,) = self._groupby.groups( + [as_column(range(0, len(self.obj)))] + ) + if self._dropna and any( + c.has_nulls(include_nan=True) > 0 + for c in self.grouping._key_columns + ): + # Scan aggregations with null/nan keys put nulls in the + # corresponding output rows in pandas, to do that here + # expand the result by reindexing. + ri = cudf.RangeIndex(0, len(self.obj)) + result.index = cudf.Index(ordering) + # This reorders and expands + result = result.reindex(ri) + else: + # Just reorder according to the groupings + result = result.take(ordering.argsort()) + # Now produce the actual index we first thought of + result.index = self.obj.index + return result + + def ohlc(self): + """ + Compute open, high, low and close values of a group, excluding missing values. + + Currently not implemented. + """ + raise NotImplementedError("ohlc is currently not implemented") + + @property + def plot(self): + """ + Make plots of a grouped Series or DataFrame. + + Currently not implemented. + """ + raise NotImplementedError("plot is currently not implemented") + + def resample(self, rule, *args, include_groups: bool = True, **kwargs): + """ + Provide resampling when using a TimeGrouper. + + Currently not implemented. + """ + raise NotImplementedError("resample is currently not implemented") + + def take(self, indices): + """ + Return the elements in the given *positional* indices in each group. + + Currently not implemented. + """ + raise NotImplementedError("take is currently not implemented") + + def filter(self, func, dropna: bool = True, *args, **kwargs): + """ + Filter elements from groups that don't satisfy a criterion. + + Currently not implemented. + """ + raise NotImplementedError("filter is currently not implemented") + + def expanding(self, *args, **kwargs): + """ + Return an expanding grouper, providing expanding + functionality per group. + + Currently not implemented. + """ + raise NotImplementedError("expanding is currently not implemented") + + def ewm(self, *args, **kwargs): + """ + Return an ewm grouper, providing ewm functionality per group. + + Currently not implemented. + """ + raise NotImplementedError("expanding is currently not implemented") + + def any(self, skipna: bool = True): + """ + Return True if any value in the group is truthful, else False. + + Currently not implemented. + """ + raise NotImplementedError("any is currently not implemented") + + def all(self, skipna: bool = True): + """ + Return True if all values in the group are truthful, else False. + + Currently not implemented. 
+ """ + raise NotImplementedError("all is currently not implemented") + + +class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + obj: "cudf.core.dataframe.DataFrame" + + _PROTECTED_KEYS = frozenset(("obj",)) + + def _reduce_numeric_only(self, op: str): + columns = list( + name + for name in self.obj._data.names + if ( + is_numeric_dtype(self.obj._data[name].dtype) + and name not in self.grouping.names + ) + ) + return self[columns].agg(op) + + def __getitem__(self, key): + return self.obj[key].groupby( + by=self.grouping.keys, + dropna=self._dropna, + sort=self._sort, + group_keys=self._group_keys, + as_index=self._as_index, + ) + def value_counts( self, subset=None, @@ -2637,68 +2847,112 @@ def value_counts( return result - def _mimic_pandas_order( - self, result: DataFrameOrSeries - ) -> DataFrameOrSeries: - """Given a groupby result from libcudf, reconstruct the row orders - matching that of pandas. This also adds appropriate indices. + @_performance_tracking + def corr( + self, method="pearson", min_periods=1, numeric_only: bool = False + ): """ - # TODO: copy metadata after this method is a common pattern, should - # merge in this method. + Compute pairwise correlation of columns, excluding NA/null values. - # This function is used to reorder the results of scan-based - # groupbys which have the same output size as input size. - # However, if the grouping key has NAs and dropna=True, the - # result coming back from libcudf has null_count few rows than - # the input, so we must produce an ordering from the full - # input range. - _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) - if self._dropna and any( - c.has_nulls(include_nan=True) > 0 - for c in self.grouping._key_columns - ): - # Scan aggregations with null/nan keys put nulls in the - # corresponding output rows in pandas, to do that here - # expand the result by reindexing. - ri = cudf.RangeIndex(0, len(self.obj)) - result.index = cudf.Index(ordering) - # This reorders and expands - result = result.reindex(ri) - else: - # Just reorder according to the groupings - result = result.take(ordering.argsort()) - # Now produce the actual index we first thought of - result.index = self.obj.index - return result + Parameters + ---------- + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. -class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): - obj: "cudf.core.dataframe.DataFrame" + Returns + ------- + DataFrame + Correlation matrix. - _PROTECTED_KEYS = frozenset(("obj",)) + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... 
"val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + """ - def _reduce_numeric_only(self, op: str): - columns = list( - name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names + if method != "pearson": + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." ) - ) - return self[columns].agg(op) - def __getitem__(self, key): - return self.obj[key].groupby( - by=self.grouping.keys, - dropna=self._dropna, - sort=self._sort, - group_keys=self._group_keys, - as_index=self._as_index, + return self._cov_or_corr( + lambda x: x.corr(method, min_periods), "Correlation" ) + def hist( + self, + column=None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[float, float] | None = None, + layout: tuple[int, int] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is not currently implemented") + + def boxplot( + self, + subplots: bool = True, + column=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + ax=None, + figsize: tuple[float, float] | None = None, + layout=None, + sharex: bool = False, + sharey: bool = True, + backend=None, + **kwargs, + ): + raise NotImplementedError("boxplot is not currently implemented") + DataFrameGroupBy.__doc__ = groupby_doc_template.format(ret="") @@ -2706,8 +2960,10 @@ def __getitem__(self, key): class SeriesGroupBy(GroupBy): obj: "cudf.core.series.Series" - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) # downcast the result to a Series: if len(result._data): @@ -2722,14 +2978,95 @@ def agg(self, func): aggregate = agg - def apply(self, func, *args): - result = super().apply(func, *args) + def apply(self, func, *args, **kwargs): + result = super().apply(func, *args, **kwargs) # apply Series name to result result.name = self.obj.name return result + @property + def dtype(self) -> pd.Series: + raise NotImplementedError("dtype is currently not implemented.") + + def hist( + self, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + figsize: tuple[float, float] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is currently not implemented.") + + @property + def is_monotonic_increasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically 
increasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_increasing is currently not implemented." + ) + + @property + def is_monotonic_decreasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically decreasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_decreasing is currently not implemented." + ) + + def nlargest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the largest n elements. + + Currently not implemented + """ + raise NotImplementedError("nlargest is currently not implemented.") + + def nsmallest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the smallest n elements. + + Currently not implemented + """ + raise NotImplementedError("nsmallest is currently not implemented.") + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> cudf.Series | cudf.DataFrame: + raise NotImplementedError("value_counts is currently not implemented.") + + def corr( + self, + other: cudf.Series, + method: str = "pearson", + min_periods: int | None = None, + ) -> cudf.Series: + raise NotImplementedError("corr is currently not implemented.") + SeriesGroupBy.__doc__ = groupby_doc_template.format(ret="") diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 4e0c5bd86b9..715bbf89b15 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -43,8 +43,10 @@ def __init__(self, obj, by, axis=None, kind=None): by = _ResampleGrouping(obj, by) super().__init__(obj, by=by) - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) if len(self.grouping.bin_labels) != len(result): index = cudf.core.index.Index( self.grouping.bin_labels, name=self.grouping.names[0] diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 826a0e52f57..74f04c0584f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3885,3 +3885,28 @@ def test_group_by_raises_category_error(op): with pytest.raises(TypeError): df.groupby(df.a).agg(op) + + +def test_ngroups(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ngroups == ggb.ngroups + assert len(pgb) == len(ggb) + + +def test_ndim(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ndim == ggb.ndim + + pser = pd.Series(range(3)) + gser = cudf.Series.from_pandas(pser) + pgb = pser.groupby([0, 0, 1]) + ggb = gser.groupby(cudf.Series([0, 0, 1])) + assert pgb.ndim == ggb.ndim From 6e7624d6b31c93b0547590929ac63ed8e3a48d24 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:06:51 -0400 Subject: [PATCH 008/270] Add stream parameter to reshape APIs (#16410) Adds `stream` parameter to reshape APIs: - `cudf::interleave_columns` - `cudf::tile` - `cudf::byte_cast` Found while working #15983 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia 
Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16410 --- cpp/include/cudf/detail/reshape.hpp | 4 --- cpp/include/cudf/reshape.hpp | 17 ++++++---- cpp/src/reshape/byte_cast.cu | 11 ++----- cpp/src/reshape/interleave_columns.cu | 3 +- cpp/src/reshape/tile.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/reshape_test.cpp | 47 +++++++++++++++++++++++++++ 7 files changed, 65 insertions(+), 21 deletions(-) create mode 100644 cpp/tests/streams/reshape_test.cpp diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index 30f8b88b116..68a856373bf 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -28,8 +28,6 @@ namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::tile - * - * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr tile(table_view const& input, size_type count, @@ -38,8 +36,6 @@ std::unique_ptr
tile(table_view const& input, /** * @copydoc cudf::interleave_columns - * - * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr interleave_columns(table_view const& input, rmm::cuda_stream_view, diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index a0a7fe694bb..07aaf6488ad 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -47,13 +47,14 @@ namespace CUDF_EXPORT cudf { * @throws cudf::logic_error if input contains no columns. * @throws cudf::logic_error if input columns dtypes are not identical. * - * @param[in] input Table containing columns to interleave - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * + * @param input Table containing columns to interleave + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return The interleaved columns as a single column */ std::unique_ptr interleave_columns( table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -68,15 +69,17 @@ std::unique_ptr interleave_columns( * return = [[8, 4, 7, 8, 4, 7], [5, 2, 3, 5, 2, 3]] * ``` * - * @param[in] input Table containing rows to be repeated - * @param[in] count Number of times to tile "rows". Must be non-negative - * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @param input Table containing rows to be repeated + * @param count Number of times to tile "rows". Must be non-negative + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * * @return The table containing the tiled "rows" */ std::unique_ptr
tile( table_view const& input, size_type count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -95,6 +98,7 @@ enum class flip_endianness : bool { NO, YES }; * * @param input_column Column to be converted to lists of bytes * @param endian_configuration Whether to retain or flip the endianness of the elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return The column containing the lists of bytes @@ -102,6 +106,7 @@ enum class flip_endianness : bool { NO, YES }; std::unique_ptr byte_cast( column_view const& input_column, flip_endianness endian_configuration, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 3dfa0b65814..2a03a5504c1 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -167,11 +167,6 @@ struct byte_list_conversion_fn byte_cast(column_view const& input, flip_endianness endian_configuration, rmm::cuda_stream_view stream, @@ -183,15 +178,13 @@ std::unique_ptr byte_cast(column_view const& input, } // namespace detail -/** - * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref) - */ std::unique_ptr byte_cast(column_view const& input, flip_endianness endian_configuration, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::byte_cast(input, endian_configuration, cudf::get_default_stream(), mr); + return detail::byte_cast(input, endian_configuration, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 79124508b11..7473b6045af 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -264,10 +264,11 @@ std::unique_ptr interleave_columns(table_view const& input, } // namespace detail std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::interleave_columns(input, cudf::get_default_stream(), mr); + return detail::interleave_columns(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 29996aa2152..3d4fb73c000 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -64,10 +64,11 @@ std::unique_ptr
tile(table_view const& in,

std::unique_ptr<table>
tile(table_view const& in, size_type count, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::tile(in, count, cudf::get_default_stream(), mr); + return detail::tile(in, count, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 22827484f9a..4dffcb41ba2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -704,6 +704,7 @@ ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE test ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/reshape_test.cpp b/cpp/tests/streams/reshape_test.cpp new file mode 100644 index 00000000000..d7c5da91bca --- /dev/null +++ b/cpp/tests/streams/reshape_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +class ReshapeTest : public cudf::test::BaseFixture {}; + +TEST_F(ReshapeTest, InterleaveColumns) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 3, 6}); + auto b = cudf::test::fixed_width_column_wrapper({1, 4, 7}); + auto c = cudf::test::fixed_width_column_wrapper({2, 5, 8}); + cudf::table_view in(std::vector{a, b, c}); + cudf::interleave_columns(in, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, Tile) +{ + auto a = cudf::test::fixed_width_column_wrapper({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + cudf::tile(in, 2, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, ByteCast) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 100, -100, 1000, 1000}); + cudf::byte_cast(a, cudf::flip_endianness::YES, cudf::test::get_default_stream()); + cudf::byte_cast(a, cudf::flip_endianness::NO, cudf::test::get_default_stream()); +} From 35796057b64e258713d4d89ba368837d30a1a9c5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 08:33:23 -1000 Subject: [PATCH 009/270] Align misc DataFrame and MultiIndex methods with pandas 2.x (#16402) The API changes in this PR are mostly adding implementations or adding missing keyword argument (although they might not be implemented). 
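A minimal sketch of the resulting behavior, using a made-up frame (the full
list of APIs follows below):

```python
import cudf

df = cudf.DataFrame({"a": [1, 2, 3]})

# The label argument of `insert` is now named `column`, as in pandas.
df.insert(1, column="b", value=[4, 5, 6])

# Newly accepted pandas keywords are validated eagerly.
try:
    df.insert(0, column="c", value=[7, 8, 9], allow_duplicates=True)
except NotImplementedError as err:
    print(err)  # allow_duplicates is currently not implemented.
```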
The APIs affected are: * `DataFrame.insert` * `DataFrame.melt` * `DataFrame.merge` * `DataFrame.quantile` * `DataFrame.cov` * `DataFrame.corr` * `DataFrame.median` * `DataFrame.rolling` * `DataFrame.resample` * `DataFrame.dropna` * `MultiIndex.from_tuple` * `MultiIndex.from_frame` * `MultiIndex.from_product` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16402 --- python/cudf/cudf/core/dataframe.py | 106 +++++++++++++++++------- python/cudf/cudf/core/indexed_frame.py | 81 +++++++++++------- python/cudf/cudf/core/multiindex.py | 38 +++++++-- python/cudf/cudf/core/reshape.py | 3 + python/cudf/cudf/core/window/ewm.py | 23 +++-- python/cudf/cudf/core/window/rolling.py | 27 +++++- python/cudf/cudf/tests/test_dropna.py | 9 ++ 7 files changed, 211 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1d7136e61e3..6ea11fe9f64 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3215,26 +3215,37 @@ def reset_index( ) @_performance_tracking - def insert(self, loc, name, value, nan_as_null=no_default): + def insert( + self, + loc, + column, + value, + allow_duplicates: bool = False, + nan_as_null=no_default, + ): """Add a column to DataFrame at the index specified by loc. Parameters ---------- loc : int location to insert by index, cannot be greater then num columns + 1 - name : number or string - name or label of column to be inserted + column : number or string + column or label of column to be inserted value : Series or array-like nan_as_null : bool, Default None If ``None``/``True``, converts ``np.nan`` values to ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ + if allow_duplicates is not False: + raise NotImplementedError( + "allow_duplicates is currently not implemented." + ) if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") return self._insert( loc=loc, - name=name, + name=column, value=value, nan_as_null=nan_as_null, ignore_index=False, @@ -4097,7 +4108,15 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) @_performance_tracking - def melt(self, **kwargs): + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ignore_index: bool = True, + ): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -4124,23 +4143,30 @@ def melt(self, **kwargs): """ from cudf.core.reshape import melt - return melt(self, **kwargs) + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ignore_index=ignore_index, + ) @_performance_tracking def merge( self, right, + how="inner", on=None, left_on=None, right_on=None, left_index=False, right_index=False, - how="inner", sort=False, - lsuffix=None, - rsuffix=None, - indicator=False, suffixes=("_x", "_y"), + indicator=False, + validate=None, ): """Merge GPU DataFrame objects by performing a database-style join operation by columns or indexes. @@ -4241,17 +4267,8 @@ def merge( raise NotImplementedError( "Only indicator=False is currently supported" ) - - if lsuffix or rsuffix: - raise ValueError( - "The lsuffix and rsuffix keywords have been replaced with the " - "``suffixes=`` keyword. 
" - "Please provide the following instead: \n\n" - " suffixes=('%s', '%s')" - % (lsuffix or "_x", rsuffix or "_y") - ) - else: - lsuffix, rsuffix = suffixes + if validate is not None: + raise NotImplementedError("validate is currently not supported.") lhs, rhs = self, right merge_cls = Merge @@ -5952,9 +5969,9 @@ def quantile( axis=0, numeric_only=True, interpolation=None, + method="single", columns=None, exact=True, - method="single", ): """ Return values at the given quantile. @@ -5980,14 +5997,14 @@ def quantile( * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. - columns : list of str - List of column names to include. - exact : boolean - Whether to use approximate or exact quantile algorithm. method : {'single', 'table'}, default `'single'` Whether to compute quantiles per-column ('single') or over all columns ('table'). When 'table', the only allowed interpolation methods are 'nearest', 'lower', and 'higher'. + columns : list of str + List of column names to include. + exact : boolean + Whether to use approximate or exact quantile algorithm. Returns ------- @@ -7309,25 +7326,47 @@ def unnamed_group_generator(): return result @_performance_tracking - def cov(self, **kwargs): + def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): """Compute the covariance matrix of a DataFrame. Parameters ---------- - **kwargs - Keyword arguments to be passed to cupy.cov + min_periods : int, optional + Minimum number of observations required per pair of columns to + have a valid result. + Currently not supported. + + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + Currently not supported. Returns ------- cov : DataFrame """ - cov = cupy.cov(self.values, rowvar=False) + if min_periods is not None: + raise NotImplementedError( + "min_periods is currently not supported." + ) + + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + + cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) df._set_columns_like(self._data) return df - def corr(self, method="pearson", min_periods=None): + def corr( + self, method="pearson", min_periods=None, numeric_only: bool = False + ): """Compute the correlation matrix of a DataFrame. Parameters @@ -7357,6 +7396,11 @@ def corr(self, method="pearson", min_periods=None): if min_periods is not None: raise NotImplementedError("Unsupported argument 'min_periods'") + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e14f8923c25..0678ebfdd81 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1495,9 +1495,7 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def median(self, axis=None, skipna=True, numeric_only=None, **kwargs): """ Return the median of the values for the requested axis. 
@@ -1857,7 +1855,16 @@ def mask( @_performance_tracking @copy_docstring(Rolling) def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None + self, + window, + min_periods=None, + center: bool = False, + win_type: str | None = None, + on=None, + axis=0, + closed: str | None = None, + step: int | None = None, + method: str = "single", ): return Rolling( self, @@ -1865,7 +1872,11 @@ def rolling( min_periods=min_periods, center=center, axis=axis, + on=on, win_type=win_type, + closed=closed, + step=step, + method=method, ) @copy_docstring(ExponentialMovingWindow) @@ -1880,6 +1891,7 @@ def ewm( ignore_na: bool = False, axis: int = 0, times: str | np.ndarray | None = None, + method: Literal["single", "table"] = "single", ): return ExponentialMovingWindow( self, @@ -1892,6 +1904,7 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, + method=method, ) @_performance_tracking @@ -3943,16 +3956,15 @@ def resample( self, rule, axis=0, - closed=None, - label=None, - convention="start", + closed: Literal["right", "left"] | None = None, + label: Literal["right", "left"] | None = None, + convention: Literal["start", "end", "s", "e"] = "start", kind=None, - loffset=None, - base=None, on=None, level=None, origin="start_day", offset=None, + group_keys: bool = False, ): """ Convert the frequency of ("resample") the given time series data. @@ -4090,26 +4102,27 @@ def resample( "deprecated and will be removed in a future version. ", FutureWarning, ) - if (axis, convention, kind, loffset, base, origin, offset) != ( - 0, - "start", - None, - None, - None, - "start_day", - None, - ): - raise NotImplementedError( - "The following arguments are not " - "currently supported by resample:\n\n" - "- axis\n" - "- convention\n" - "- kind\n" - "- loffset\n" - "- base\n" - "- origin\n" - "- offset" + raise NotImplementedError("kind is currently not supported.") + if axis != 0: + warnings.warn( + "The 'axis' keyword in is " + "deprecated and will be removed in a future version. ", + FutureWarning, ) + raise NotImplementedError("axis is currently not supported.") + if convention != "start": + warnings.warn( + "The 'convention' keyword in is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) + raise NotImplementedError("convention is currently not supported.") + if origin != "start_day": + raise NotImplementedError("origin is currently not supported.") + if offset is not None: + raise NotImplementedError("offset is currently not supported.") + if group_keys is not False: + raise NotImplementedError("group_keys is currently not supported.") by = cudf.Grouper( key=on, freq=rule, closed=closed, label=label, level=level ) @@ -4120,7 +4133,13 @@ def resample( ) def dropna( - self, axis=0, how="any", thresh=None, subset=None, inplace=False + self, + axis=0, + how="any", + thresh=None, + subset=None, + inplace=False, + ignore_index: bool = False, ): """ Drop rows (or columns) containing nulls from a Column. @@ -4144,6 +4163,8 @@ def dropna( columns, subset is a list of rows to consider. inplace : bool, default False If True, do operation inplace and return None. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. 
Returns ------- @@ -4220,6 +4241,8 @@ def dropna( """ if axis == 0: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) + if ignore_index: + result.index = RangeIndex(len(result)) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dfc596bf279..0e1fddd7ed5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -524,8 +524,10 @@ def codes(self): col.values for col in self._codes ) - def get_slice_bound(self, label, side, kind=None): - raise NotImplementedError() + def get_slice_bound(self, label, side): + raise NotImplementedError( + "get_slice_bound is not currently implemented." + ) @property # type: ignore @_performance_tracking @@ -1108,7 +1110,7 @@ def _concat(cls, objs): @classmethod @_performance_tracking - def from_tuples(cls, tuples, names=None): + def from_tuples(cls, tuples, sortorder: int | None = None, names=None): """ Convert list of tuples to MultiIndex. @@ -1116,6 +1118,9 @@ def from_tuples(cls, tuples, names=None): ---------- tuples : list / sequence of tuple-likes Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. @@ -1142,7 +1147,9 @@ def from_tuples(cls, tuples, names=None): names=['number', 'color']) """ # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_tuples(tuples, names=names) + pdi = pd.MultiIndex.from_tuples( + tuples, sortorder=sortorder, names=names + ) return cls.from_pandas(pdi) @_performance_tracking @@ -1215,7 +1222,12 @@ def values(self): @classmethod @_performance_tracking - def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): + def from_frame( + cls, + df: pd.DataFrame | cudf.DataFrame, + sortorder: int | None = None, + names=None, + ): """ Make a MultiIndex from a DataFrame. @@ -1223,6 +1235,9 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ---------- df : DataFrame DataFrame to be converted to MultiIndex. + sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). names : list-like, optional If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite @@ -1273,11 +1288,13 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): else: source_data = df names = names if names is not None else source_data._column_names - return cls.from_arrays(source_data._columns, names=names) + return cls.from_arrays( + source_data._columns, sortorder=sortorder, names=names + ) @classmethod @_performance_tracking - def from_product(cls, arrays, names=None): + def from_product(cls, iterables, sortorder: int | None = None, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1285,6 +1302,9 @@ def from_product(cls, arrays, names=None): ---------- iterables : list / sequence of iterables Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. 
        If not explicitly provided, names will be inferred from the
@@ -1314,7 +1334,9 @@ def from_product(
                    names=['number', 'color'])
        """
        # Use Pandas for handling Python host objects
-        pdi = pd.MultiIndex.from_product(arrays, names=names)
+        pdi = pd.MultiIndex.from_product(
+            iterables, sortorder=sortorder, names=names
+        )
        return cls.from_pandas(pdi)

    @classmethod
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index a542c5f5969..e7248977b1d 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -502,6 +502,7 @@ def melt(
     var_name=None,
     value_name="value",
     col_level=None,
+    ignore_index: bool = True,
 ):
     """Unpivots a DataFrame from wide format to long format,
     optionally leaving identifier variables set.
@@ -566,6 +567,8 @@ def melt(
     """
     if col_level is not None:
         raise NotImplementedError("col_level != None is not supported yet.")
+    if ignore_index is not True:
+        raise NotImplementedError("ignore_index is currently not supported.")

     # Arg cleaning
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index 1203a840076..ef0f6958aeb 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -1,7 +1,9 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
-
 from __future__ import annotations

+import warnings
+from typing import Literal
+
 import numpy as np

 from cudf._lib.reduce import scan
@@ -103,13 +105,24 @@ def __init__(
         ignore_na: bool = False,
         axis: int = 0,
         times: str | np.ndarray | None = None,
+        method: Literal["single", "table"] = "single",
     ):
-        if (min_periods, ignore_na, axis, times) != (0, False, 0, None):
+        if min_periods != 0:
             raise NotImplementedError(
-                "The parameters `min_periods`, `ignore_na`, "
-                "`axis`, and `times` are not yet supported."
+                "min_periods is currently not supported."
             )
-
+        if ignore_na is not False:
+            raise NotImplementedError("ignore_na is currently not supported.")
+        if axis != 0:
+            warnings.warn(
+                "axis is deprecated and will be removed in a future version. "
+                "Transpose the DataFrame first instead."
+            )
+            raise NotImplementedError("axis is currently not supported.")
+        if times is not None:
+            raise NotImplementedError("times is currently not supported.")
+        if method != "single":
+            raise NotImplementedError("method is currently not supported.")
         self.obj = obj
         self.adjust = adjust
         self.com = get_center_of_mass(com, span, halflife, alpha)
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 29391c68471..043a41145e5 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -1,4 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION
+from __future__ import annotations
+
+import warnings

 import numba
 import pandas as pd
@@ -196,17 +199,26 @@ def __init__(
         obj,
         window,
         min_periods=None,
-        center=False,
+        center: bool = False,
+        win_type: str | None = None,
+        on=None,
         axis=0,
-        win_type=None,
+        closed: str | None = None,
+        step: int | None = None,
+        method: str = "single",
     ):
         self.obj = obj
         self.window = window
         self.min_periods = min_periods
         self.center = center
         self._normalize()
+        # Extra parameters that only some aggregations need (e.g. ddof for var/std)
+        self.agg_params: dict[str, int] = {}
-        self.agg_params = {}
         if axis != 0:
+            warnings.warn(
+                "axis is deprecated and will be removed in a future version. "
+                "Transpose the DataFrame first instead."
+            )
             raise NotImplementedError("axis != 0 is not supported yet.")
         self.axis = axis

@@ -217,6 +229,15 @@ def __init__(
             )
             self.win_type = win_type

+        if on is not None:
+            raise NotImplementedError("on is currently not supported")
+        if closed not in (None, "right"):
+            raise NotImplementedError("closed is currently not supported")
+        if step is not None:
+            raise NotImplementedError("step is currently not supported")
+        if method != "single":
+            raise NotImplementedError("method is currently not supported")
+
     def __getitem__(self, arg):
         if isinstance(arg, tuple):
             arg = list(arg)
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index ed0cf0053ea..5b1ee0ffac6 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -284,3 +284,12 @@ def test_dropna_multiindex_2(data, how):
     got = gi.dropna(how)

     assert_eq(expect, got)
+
+
+def test_ignore_index():
+    pser = pd.Series([1, 2, np.nan], index=[2, 4, 1])
+    gser = cudf.from_pandas(pser)
+
+    expected = pser.dropna(ignore_index=True)
+    result = gser.dropna(ignore_index=True)
+    assert_eq(result, expected)

From 743e16426c564d0ed0d7e3d9be5f67e4605c4f32 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 29 Jul 2024 14:19:43 -0500
Subject: [PATCH 010/270] update some branch references in GitHub Actions
 configs (#16397)

Fixes some lingering references to `branch-24.08` in the
`pr_issue_status_automation` CI workflow. This was missed when new
branches were cut because that file ends in `.yml` and
`update-version.sh` was only modifying files ending in `.yaml`.

The corresponding `update-version.sh` changes were made in #16183 and
are already on 24.10 thanks to forward mergers.

https://github.com/rapidsai/cudf/blob/dc05a01f3fc0742c5fbbddd86a0f2007bfdc2050/ci/release/update-version.sh#L78

## Notes for Reviewers

I checked like this, and don't see any other missed references:

```shell
git grep -E '24\.8|24\.08|0\.39'
```

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/16397
---
 .github/workflows/pr_issue_status_automation.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index 8ca971dc28d..45e5191eb54 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:

 jobs:
   get-project-id:
-    uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10
     if: github.event.pull_request.state == 'open'
     secrets: inherit
     permissions:
@@ -34,7 +34,7 @@ jobs:

   update-status:
     # This job sets the PR and its linked issues to "In Progress" status
-    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10
     if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
     needs: get-project-id
     with:
@@ -50,7 +50,7 @@ jobs:

   update-sprint:
     # This job sets the PR and its linked issues to the current "Weekly Sprint"
-    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10
     if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
     needs: get-project-id
     with:

From f8eb63e499f94d583d715f5c1f5e6f234589be57 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 29 Jul 2024 12:39:19 -1000
Subject: [PATCH 011/270] Align Index APIs with pandas 2.x (#16361)

Similar to https://github.com/rapidsai/cudf/pull/16310, the following
APIs have been modified to adjust/add parameters

* `to_flat_index`
* `isin`
* `unique`
* `transpose`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16361
---
 docs/cudf/source/conf.py                     |  5 ++++
 python/cudf/cudf/core/_base_index.py         | 25 ++++++++++++++++++--
 python/cudf/cudf/core/index.py               | 24 +++++++++++++++----
 python/cudf/cudf/core/multiindex.py          | 16 +++++++++++--
 python/cudf/cudf/core/series.py              |  8 -------
 python/cudf/cudf/core/single_column_frame.py |  7 ++++++
 python/cudf/cudf/tests/test_multiindex.py    |  9 +++++++
 7 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index f544536fb31..7421d9be298 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -561,6 +561,11 @@ def on_missing_reference(app, env, node, contnode):
     ("py:class", "ScalarLike"),
     ("py:class", "ParentType"),
     ("py:class", "ColumnLike"),
+    ("py:class", "ColumnLike"),
+    ("py:obj", "cudf.Index.transpose"),
+    ("py:obj", "cudf.Index.T"),
+    ("py:obj", "cudf.Index.to_flat_index"),
+    ("py:obj", "cudf.MultiIndex.to_flat_index"),
     # TODO: Remove this when we figure out why typing_extensions doesn't seem
     # to map types correctly for intersphinx
     ("py:class", "typing_extensions.Self"),
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 8fad82c5c46..c91514202c5 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -868,6 +868,24 @@ def to_numpy(self):
         """Convert to a numpy array."""
         raise NotImplementedError

+    def to_flat_index(self) -> Self:
+        """
+        Identity method.
+
+        This is implemented for compatibility with subclass implementations
+        when chaining.
+
+        Returns
+        -------
+        Index
+            Caller.
+
+        See Also
+        --------
+        MultiIndex.to_flat_index : Subclass implementation.
+        """
+        return self
+
     def any(self):
         """
         Return whether any elements is True in Index.
@@ -945,7 +963,7 @@ def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False):
         """
         raise NotImplementedError

-    def isin(self, values):
+    def isin(self, values, level=None):
         """Return a boolean array where the index values are in values.

         Compute boolean array of whether each index value is found in
@@ -956,6 +974,9 @@ def isin(self, values):
         ----------
         values : set, list-like, Index
             Sought values.
+        level : str or int, optional
+            Name or position of the index level to use (if the index is a
+            `MultiIndex`).

         Returns
         -------
@@ -979,7 +1000,7 @@ def isin(self, values):
         # ColumnBase.isin).
         raise NotImplementedError

-    def unique(self):
+    def unique(self, level: int | None = None):
         """
         Return unique values in the index.
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1c48b8f4f2d..156cb973a9a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -540,8 +540,12 @@ def memory_usage(self, deep: bool = False) -> int: ) return 0 - def unique(self) -> Self: + def unique(self, level: int | None = None) -> Self: # RangeIndex always has unique values + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return self.copy() @_performance_tracking @@ -964,7 +968,11 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [] return as_column(i, dtype=size_type_dtype) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " @@ -1616,12 +1624,20 @@ def append(self, other): return self._concat(to_concat) - def unique(self): + def unique(self, level: int | None = None) -> Self: + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return cudf.core.index._index_from_data( {self.name: self._values.unique()}, name=self.name ) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0e1fddd7ed5..2788455aebf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1156,6 +1156,15 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None): def to_numpy(self): return self.values_host + def to_flat_index(self): + """ + Convert a MultiIndex to an Index of Tuples containing the level values. + + This is not currently implemented + """ + # TODO: Could implement as Index of ListDtype? 
+ raise NotImplementedError("to_flat_index is not currently supported.") + @property # type: ignore @_performance_tracking def values_host(self): @@ -1734,8 +1743,11 @@ def fillna(self, value): return super().fillna(value=value) @_performance_tracking - def unique(self): - return self.drop_duplicates(keep="first") + def unique(self, level: int | None = None) -> Self | cudf.Index: + if level is None: + return self.drop_duplicates(keep="first") + else: + return self.get_level_values(level).unique() @_performance_tracking def nunique(self, dropna: bool = True) -> int: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8277ccf68fc..10ac1fdfc1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2775,14 +2775,6 @@ def cov(self, other, min_periods=None, ddof: int | None = None): f"{other.dtype}" ) - @_performance_tracking - def transpose(self): - """Return the transpose, which is by definition self.""" - - return self - - T = property(transpose, doc=transpose.__doc__) - @_performance_tracking def duplicated(self, keep="first"): """ diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index b93528f9693..a5ff1223791 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -389,3 +389,10 @@ def where(self, cond, other=None, inplace=False): result = cudf._lib.copying.copy_if_else(input_col, other, cond) return _make_categorical_like(result, self_column) + + @_performance_tracking + def transpose(self): + """Return the transpose, which is by definition self.""" + return self + + T = property(transpose, doc=transpose.__doc__) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 2c00d48266c..b7314a36e73 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2170,3 +2170,12 @@ def test_bool_raises(): lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], ) + + +def test_unique_level(): + pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) + cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) + + result = pd_mi.unique(level=1) + expected = cudf_mi.unique(level=1) + assert_eq(result, expected) From 368a34ca9fd7db1b6cfb6e7817978e3e4fcfb00b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 29 Jul 2024 20:05:17 -0500 Subject: [PATCH 012/270] Use RMM adaptor constructors instead of factories. (#16414) This PR uses RMM memory resource adaptor constructors instead of factory functions. With CTAD, we do not need the factory and can use the constructor directly. The factory will be deprecated in https://github.com/rapidsai/rmm/pull/1626. 
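
For illustration, a minimal sketch of the pattern this change adopts (not part of
the diff; the header paths and restore step are assumptions based on RMM's public
API as used in the hunks below):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

void run_with_allocation_tracking()
{
  // Upstream is the current device resource (a device_memory_resource*).
  auto* upstream = rmm::mr::get_current_device_resource();

  // CTAD deduces the adaptor's template argument from the constructor
  // argument, so the make_statistics_adaptor() factory is unnecessary.
  rmm::mr::statistics_resource_adaptor statistics_mr{upstream};

  rmm::mr::set_current_device_resource(&statistics_mr);
  // ... run the work to be measured ...
  rmm::mr::set_current_device_resource(upstream);  // restore before the adaptor is destroyed
}
```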
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Nghia Truong (https://github.com/ttnghia) - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) URL: https://github.com/rapidsai/cudf/pull/16414 --- cpp/benchmarks/fixture/benchmark_fixture.hpp | 2 +- .../cudf_test/stream_checking_resource_adaptor.hpp | 12 ------------ cpp/include/cudf_test/testing_main.hpp | 2 +- java/src/main/native/src/RmmJni.cpp | 7 ------- 4 files changed, 2 insertions(+), 21 deletions(-) diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index 8c8d6756b00..8900899f9be 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -107,7 +107,7 @@ class memory_stats_logger { public: memory_stats_logger() : existing_mr(rmm::mr::get_current_device_resource()), - statistics_mr(rmm::mr::make_statistics_adaptor(existing_mr)) + statistics_mr(rmm::mr::statistics_resource_adaptor(existing_mr)) { rmm::mr::set_current_device_resource(&statistics_mr); } diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp index 4f3c723d195..417bbb3d9ab 100644 --- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp +++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp @@ -156,16 +156,4 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res // cudf::test::get_default_stream() is observed. }; -/** - * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the - * upstream resource `upstream`. - * - * @param upstream Reference to the upstream resource - */ -inline stream_checking_resource_adaptor make_stream_checking_resource_adaptor( - rmm::device_async_resource_ref upstream, bool error_on_invalid_stream, bool check_default_stream) -{ - return stream_checking_resource_adaptor{upstream, error_on_invalid_stream, check_default_stream}; -} - } // namespace cudf::test diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 9866253a9f8..ed83ddabb00 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -183,7 +183,7 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); auto const error_on_invalid_stream = (stream_error_mode == "error"); auto const check_default_stream = (stream_mode == "new_cudf_default"); - auto adaptor = cudf::test::make_stream_checking_resource_adaptor( + auto adaptor = cudf::test::stream_checking_resource_adaptor( resource, error_on_invalid_stream, check_default_stream); if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { rmm::mr::set_current_device_resource(&adaptor); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5842a980fc4..09c04a77590 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -154,13 +154,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { } }; -template -tracking_resource_adaptor* make_tracking_adaptor(Upstream* upstream, - std::size_t size_alignment) -{ - return new tracking_resource_adaptor{upstream, size_alignment}; -} - /** * @brief An RMM device memory resource adaptor that delegates to the wrapped resource * for most operations but will call Java to handle certain situations (e.g.: allocation failure). 
From d1be0b6dc06fddd0b69fb69731281b16894cb132 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 15:12:38 -1000 Subject: [PATCH 013/270] Align CategoricalIndex APIs with pandas 2.x (#16369) Mostly exposing methods that were available on the CategoricalColumn Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16369 --- python/cudf/cudf/core/column/categorical.py | 130 +++++++++++--------- python/cudf/cudf/core/index.py | 116 +++++++++++++++++ python/cudf/cudf/tests/test_categorical.py | 56 +++++++++ 3 files changed, 247 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9aaccca349d..9433a91b9c6 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -262,37 +262,10 @@ def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: dtype: category Categories (2, int64): [1, 2] """ - old_categories = self._column.categories - new_categories = column.as_column( - new_categories, - dtype=old_categories.dtype if len(new_categories) == 0 else None, - ) - - if is_mixed_with_object_dtype(old_categories, new_categories): - raise TypeError( - f"cudf does not support adding categories with existing " - f"categories of dtype `{old_categories.dtype}` and new " - f"categories of dtype `{new_categories.dtype}`, please " - f"type-cast new_categories to the same type as " - f"existing categories." - ) - common_dtype = find_common_type( - [old_categories.dtype, new_categories.dtype] + return self._return_or_inplace( + self._column.add_categories(new_categories=new_categories) ) - new_categories = new_categories.astype(common_dtype) - old_categories = old_categories.astype(common_dtype) - - if old_categories.isin(new_categories).any(): - raise ValueError("new categories must not include old categories") - - new_categories = old_categories.append(new_categories) - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) - def remove_categories( self, removals: Any, @@ -349,23 +322,9 @@ def remove_categories( dtype: category Categories (3, int64): [1, 2, 10] """ - - cats = self.categories.to_series() - removals = cudf.Series(removals, dtype=cats.dtype) - removals_mask = removals.isin(cats) - - # ensure all the removals are in the current categories - # list. If not, raise an error to match Pandas behavior - if not removals_mask.all(): - vals = removals[~removals_mask].to_numpy() - raise ValueError(f"removals must all be in old categories: {vals}") - - new_categories = cats[~cats.isin(removals)]._column - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) + return self._return_or_inplace( + self._column.remove_categories(removals=removals) + ) def set_categories( self, @@ -1319,7 +1278,7 @@ def _set_categories( new_categories: Any, is_unique: bool = False, ordered: bool = False, - ) -> CategoricalColumn: + ) -> Self: """Returns a new CategoricalColumn with the categories set to the specified *new_categories*. 
@@ -1376,17 +1335,68 @@ def _set_categories( new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in - return column.build_categorical_column( - categories=new_cats, - codes=column.build_column( - new_codes.base_data, dtype=new_codes.dtype + return cast( + Self, + column.build_categorical_column( + categories=new_cats, + codes=column.build_column( + new_codes.base_data, dtype=new_codes.dtype + ), + mask=new_codes.base_mask, + size=new_codes.size, + offset=new_codes.offset, + ordered=ordered, ), - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, ) + def add_categories(self, new_categories: Any) -> Self: + old_categories = self.categories + new_categories = column.as_column( + new_categories, + dtype=old_categories.dtype if len(new_categories) == 0 else None, + ) + if is_mixed_with_object_dtype(old_categories, new_categories): + raise TypeError( + f"cudf does not support adding categories with existing " + f"categories of dtype `{old_categories.dtype}` and new " + f"categories of dtype `{new_categories.dtype}`, please " + f"type-cast new_categories to the same type as " + f"existing categories." + ) + common_dtype = find_common_type( + [old_categories.dtype, new_categories.dtype] + ) + + new_categories = new_categories.astype(common_dtype) + old_categories = old_categories.astype(common_dtype) + + if old_categories.isin(new_categories).any(): + raise ValueError("new categories must not include old categories") + + new_categories = old_categories.append(new_categories) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + + def remove_categories( + self, + removals: Any, + ) -> Self: + removals = column.as_column(removals).astype(self.categories.dtype) + removals_mask = removals.isin(self.categories) + + # ensure all the removals are in the current categories + # list. If not, raise an error to match Pandas behavior + if not removals_mask.all(): + raise ValueError("removals must all be in old categories") + + new_categories = self.categories.apply_boolean_mask( + self.categories.isin(removals).unary_operator("not") + ) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + def reorder_categories( self, new_categories: Any, @@ -1404,6 +1414,16 @@ def reorder_categories( ) return self._set_categories(new_categories, ordered=ordered) + def rename_categories(self, new_categories) -> CategoricalColumn: + raise NotImplementedError( + "rename_categories is currently not supported." + ) + + def remove_unused_categories(self) -> Self: + raise NotImplementedError( + "remove_unused_categories is currently not supported." + ) + def as_ordered(self, ordered: bool): if self.dtype.ordered == ordered: return self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 156cb973a9a..8c3b091abec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2721,6 +2721,10 @@ def __init__( data = data.as_ordered(ordered=False) super().__init__(data, name=name) + @property + def ordered(self) -> bool: + return self._column.ordered + @property # type: ignore @_performance_tracking def codes(self): @@ -2743,6 +2747,118 @@ def _is_boolean(self): def _is_categorical(self): return True + def add_categories(self, new_categories) -> Self: + """ + Add new categories. 
+
+        `new_categories` will be included at the last/highest place in the
+        categories and will be unused directly after this call.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.add_categories(new_categories)}
+        )
+
+    def as_ordered(self) -> Self:
+        """
+        Set the Categorical to be ordered.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.as_ordered(ordered=True)}
+        )
+
+    def as_unordered(self) -> Self:
+        """
+        Set the Categorical to be unordered.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.as_ordered(ordered=False)}
+        )
+
+    def remove_categories(self, removals) -> Self:
+        """
+        Remove the specified categories.
+
+        `removals` must be included in the old categories.
+
+        Parameters
+        ----------
+        removals : category or list of categories
+            The categories which should be removed.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.remove_categories(removals)}
+        )
+
+    def remove_unused_categories(self) -> Self:
+        """
+        Remove categories which are not used.
+
+        This method is currently not supported.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.remove_unused_categories()}
+        )
+
+    def rename_categories(self, new_categories) -> Self:
+        """
+        Rename categories.
+
+        This method is currently not supported.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.rename_categories(new_categories)}
+        )
+
+    def reorder_categories(self, new_categories, ordered=None) -> Self:
+        """
+        Reorder categories as specified in new_categories.
+
+        ``new_categories`` needs to include all old categories and no new
+        category items.
+
+        Parameters
+        ----------
+        new_categories : Index-like
+            The categories in new order.
+        ordered : bool, optional
+            Whether or not the categorical is treated as an ordered categorical.
+            If not given, do not change the ordered information.
+        """
+        return type(self)._from_data(
+            {
+                self.name: self._column.reorder_categories(
+                    new_categories, ordered=ordered
+                )
+            }
+        )
+
+    def set_categories(
+        self, new_categories, ordered=None, rename: bool = False
+    ) -> Self:
+        """
+        Set the categories to the specified new_categories.
+
+        Parameters
+        ----------
+        new_categories : list-like
+            The categories in new order.
+        ordered : bool, default None
+            Whether or not the categorical is treated as
+            an ordered categorical. If not given, do
+            not change the ordered information.
+        rename : bool, default False
+            Whether or not the `new_categories` should be
+            considered as a rename of the old categories
+            or as reordered categories.
+        """
+        return type(self)._from_data(
+            {
+                self.name: self._column.set_categories(
+                    new_categories, ordered=ordered, rename=rename
+                )
+            }
+        )
+

 @_performance_tracking
 def interval_range(
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 9b6029582ce..ae58af8ebce 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -891,3 +891,59 @@ def test_categorical_maxima(op):
     result = getattr(ser.cat.as_ordered(), op)()
     result_pd = getattr(ser_pd.cat.as_ordered(), op)()
     assert_eq(result, result_pd)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_ordered(ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
+    cudf_ci = cudf.from_pandas(pd_ci)
+    assert pd_ci.ordered == cudf_ci.ordered
+
+
+@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"])
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_as_ordered(method, ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = getattr(pd_ci, method)()
+    result = getattr(cudf_ci, method)()
+    assert_eq(result, expected)
+
+
+def test_index_add_categories():
+    pd_ci = pd.CategoricalIndex([1, 2, 3])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.add_categories([4])
+    result = cudf_ci.add_categories([4])
+    assert_eq(result, expected)
+
+
+def test_index_remove_categories():
+    pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.remove_categories([4])
+    result = cudf_ci.remove_categories([4])
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_reorder_categories(ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered)
+    result = cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered)
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_set_categories(ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered)
+    result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered)
+    assert_eq(result, expected)

From 8def2ec1acac6a538002db011d977bb22cfbda82 Mon Sep 17 00:00:00 2001
From: Jason Lowe
Date: Tue, 30 Jul 2024 14:34:59 -0500
Subject: [PATCH 014/270] Add Java APIs to copy column data to host
 asynchronously (#16429)

Adds Java methods to ColumnView to allow copying of column data to host
memory asynchronously. This can be used to avoid unnecessary stream
synchronizations when copying many columns to the host.
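
As a usage sketch (mirroring the serialization change below; the wrapper class
is illustrative, not part of the patch), the batch pattern looks like this:

```java
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.Cuda;
import ai.rapids.cudf.HostColumnVector;

class HostCopyExample {
  // One stream synchronization for the whole batch instead of one
  // implicit synchronization per column via copyToHost().
  static HostColumnVector[] copyAllToHost(ColumnVector[] columns) {
    HostColumnVector[] onHost = new HostColumnVector[columns.length];
    for (int i = 0; i < columns.length; i++) {
      onHost[i] = columns[i].copyToHostAsync(Cuda.DEFAULT_STREAM);
    }
    // Callers MUST synchronize on the stream before reading any host data.
    Cuda.DEFAULT_STREAM.sync();
    return onHost;
  }
}
```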
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/16429 --- .../main/java/ai/rapids/cudf/ColumnView.java | 52 +++++++++++++------ .../java/ai/rapids/cudf/HostColumnVector.java | 4 ++ .../ai/rapids/cudf/HostColumnVectorCore.java | 4 +- .../ai/rapids/cudf/JCudfSerialization.java | 5 +- 4 files changed, 45 insertions(+), 20 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 997ff77bae3..8ff2f0f0a73 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -5034,8 +5034,8 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows // DATA MOVEMENT ///////////////////////////////////////////////////////////////////////////// - private static HostColumnVectorCore copyToHostNestedHelper( - ColumnView deviceCvPointer, HostMemoryAllocator hostMemoryAllocator) { + private static HostColumnVectorCore copyToHostAsyncNestedHelper( + Cuda.Stream stream, ColumnView deviceCvPointer, HostMemoryAllocator hostMemoryAllocator) { if (deviceCvPointer == null) { return null; } @@ -5056,20 +5056,20 @@ private static HostColumnVectorCore copyToHostNestedHelper( currValidity = deviceCvPointer.getValid(); if (currData != null) { hostData = hostMemoryAllocator.allocate(currData.length); - hostData.copyFromDeviceBuffer(currData); + hostData.copyFromDeviceBufferAsync(currData, stream); } if (currValidity != null) { hostValid = hostMemoryAllocator.allocate(currValidity.length); - hostValid.copyFromDeviceBuffer(currValidity); + hostValid.copyFromDeviceBufferAsync(currValidity, stream); } if (currOffsets != null) { hostOffsets = hostMemoryAllocator.allocate(currOffsets.length); - hostOffsets.copyFromDeviceBuffer(currOffsets); + hostOffsets.copyFromDeviceBufferAsync(currOffsets, stream); } int numChildren = deviceCvPointer.getNumChildren(); for (int i = 0; i < numChildren; i++) { try(ColumnView childDevPtr = deviceCvPointer.getChildColumnView(i)) { - children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator)); + children.add(copyToHostAsyncNestedHelper(stream, childDevPtr, hostMemoryAllocator)); } } currNullCount = deviceCvPointer.getNullCount(); @@ -5103,11 +5103,20 @@ private static HostColumnVectorCore copyToHostNestedHelper( } } + /** Copy the data to the host synchronously. */ + public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { + HostColumnVector result = copyToHostAsync(Cuda.DEFAULT_STREAM, hostMemoryAllocator); + Cuda.DEFAULT_STREAM.sync(); + return result; + } + /** - * Copy the data to the host. + * Copy the data to the host asynchronously. The caller MUST synchronize on the stream + * before examining the result. 
*/ - public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { - try (NvtxRange toHost = new NvtxRange("ensureOnHost", NvtxColor.BLUE)) { + public HostColumnVector copyToHostAsync(Cuda.Stream stream, + HostMemoryAllocator hostMemoryAllocator) { + try (NvtxRange toHost = new NvtxRange("toHostAsync", NvtxColor.BLUE)) { HostMemoryBuffer hostDataBuffer = null; HostMemoryBuffer hostValidityBuffer = null; HostMemoryBuffer hostOffsetsBuffer = null; @@ -5127,16 +5136,16 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { if (!type.isNestedType()) { if (valid != null) { hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength()); - hostValidityBuffer.copyFromDeviceBuffer(valid); + hostValidityBuffer.copyFromDeviceBufferAsync(valid, stream); } if (offsets != null) { hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.length); - hostOffsetsBuffer.copyFromDeviceBuffer(offsets); + hostOffsetsBuffer.copyFromDeviceBufferAsync(offsets, stream); } // If a strings column is all null values there is no data buffer allocated if (data != null) { hostDataBuffer = hostMemoryAllocator.allocate(data.length); - hostDataBuffer.copyFromDeviceBuffer(data); + hostDataBuffer.copyFromDeviceBufferAsync(data, stream); } HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount), hostDataBuffer, hostValidityBuffer, hostOffsetsBuffer); @@ -5145,21 +5154,21 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { } else { if (data != null) { hostDataBuffer = hostMemoryAllocator.allocate(data.length); - hostDataBuffer.copyFromDeviceBuffer(data); + hostDataBuffer.copyFromDeviceBufferAsync(data, stream); } if (valid != null) { hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength()); - hostValidityBuffer.copyFromDeviceBuffer(valid); + hostValidityBuffer.copyFromDeviceBufferAsync(valid, stream); } if (offsets != null) { hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.getLength()); - hostOffsetsBuffer.copyFromDeviceBuffer(offsets); + hostOffsetsBuffer.copyFromDeviceBufferAsync(offsets, stream); } List children = new ArrayList<>(); for (int i = 0; i < getNumChildren(); i++) { try (ColumnView childDevPtr = getChildColumnView(i)) { - children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator)); + children.add(copyToHostAsyncNestedHelper(stream, childDevPtr, hostMemoryAllocator)); } } HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount), @@ -5192,10 +5201,19 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { } } + /** Copy the data to host memory synchronously */ public HostColumnVector copyToHost() { return copyToHost(DefaultHostMemoryAllocator.get()); } + /** + * Copy the data to the host asynchronously. The caller MUST synchronize on the stream + * before examining the result. + */ + public HostColumnVector copyToHostAsync(Cuda.Stream stream) { + return copyToHostAsync(stream, DefaultHostMemoryAllocator.get()); + } + /** * Calculate the total space required to copy the data to the host. This should be padded to * the alignment that the CPU requires. 
diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 6b41d10fee3..61b11673957 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -92,6 +92,8 @@ public interface EventHandler { public HostColumnVector(DType type, long rows, Optional nullCount, HostMemoryBuffer hostDataBuffer, HostMemoryBuffer hostValidityBuffer, HostMemoryBuffer offsetBuffer, List nestedHcv) { + // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be + // asynchronously written by the device. super(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, offsetBuffer, nestedHcv); refCount = 0; incRefCountInternal(true); @@ -100,6 +102,8 @@ public HostColumnVector(DType type, long rows, Optional nullCount, HostColumnVector(DType type, long rows, Optional nullCount, HostMemoryBuffer hostDataBuffer, HostMemoryBuffer hostValidityBuffer, HostMemoryBuffer offsetBuffer) { + // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be + // asynchronously written by the device. super(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, offsetBuffer, new ArrayList<>()); assert !type.equals(DType.LIST) : "This constructor should not be used for list type"; if (nullCount.isPresent() && nullCount.get() > 0 && hostValidityBuffer == null) { diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java index 95d209c0984..a225fbf34e1 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,6 +47,8 @@ public class HostColumnVectorCore implements AutoCloseable { public HostColumnVectorCore(DType type, long rows, Optional nullCount, HostMemoryBuffer data, HostMemoryBuffer validity, HostMemoryBuffer offsets, List nestedChildren) { + // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be + // asynchronously written by the device. this.offHeap = new OffHeapState(data, validity, offsets); MemoryCleaner.register(this, offHeap); this.type = type; diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index 666a8864003..89f363d2b29 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -907,8 +907,9 @@ private static ColumnBufferProvider[] providersFrom(ColumnVector[] columns) {
     boolean success = false;
     try {
       for (int i = 0; i < columns.length; i++) {
-        onHost[i] = columns[i].copyToHost();
+        onHost[i] = columns[i].copyToHostAsync(Cuda.DEFAULT_STREAM);
       }
+      Cuda.DEFAULT_STREAM.sync();
       ColumnBufferProvider[] ret = providersFrom(onHost, true);
       success = true;
       return ret;

From 79a1eed785fccbca2c20ff5cc844ec1a9e741ee5 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 31 Jul 2024 11:00:30 -0400
Subject: [PATCH 015/270] Remove checking for specific tests in memcheck
 script (#16412)

Removes the checking for specific gtests in the
`run_cudf_memcheck_ctests.sh` script. Each of those tests can check the
`LIBCUDF_MEMCHECK_ENABLED` environment variable itself. This simplifies
the script logic and may help replace it with ctest logic in the
future.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16412
---
 ci/run_cudf_memcheck_ctests.sh             | 3 ---
 cpp/tests/error/error_handling_test.cu     | 4 ++++
 .../test_default_stream_identification.cu  | 1 +
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/ci/run_cudf_memcheck_ctests.sh b/ci/run_cudf_memcheck_ctests.sh
index aacd93e3b96..653829db419 100755
--- a/ci/run_cudf_memcheck_ctests.sh
+++ b/ci/run_cudf_memcheck_ctests.sh
@@ -15,9 +15,6 @@ export LIBCUDF_MEMCHECK_ENABLED=1
 for gt in ./*_TEST ; do
     test_name=$(basename ${gt})
     # Run gtests with compute-sanitizer
-    if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
-        continue
-    fi
     echo "Running compute-sanitizer on $test_name"
     compute-sanitizer --tool memcheck ${gt} "$@"
 done
diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
index 46d01ec14ff..1dfe45556c4 100644
--- a/cpp/tests/error/error_handling_test.cu
+++ b/cpp/tests/error/error_handling_test.cu
@@ -50,6 +50,8 @@ CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }
 // calls.
 TEST(StreamCheck, FailedKernel)
 {
+  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); }
+
   rmm::cuda_stream stream;
   int a;
   test_kernel<<<0, 0, 0, stream.value()>>>(&a);
@@ -61,6 +63,8 @@ TEST(StreamCheck, FailedKernel)

 TEST(StreamCheck, CatchFailedKernel)
 {
+  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); }
+
   rmm::cuda_stream stream;
   int a;
   test_kernel<<<0, 0, 0, stream.value()>>>(&a);
diff --git a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu
index 268c7b37c81..c5fb75a7a8e 100644
--- a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu
+++ b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu
@@ -33,6 +33,7 @@ void test_cudaLaunchKernel()
   } catch (std::runtime_error&) {
     return;
   }
+  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }

   throw std::runtime_error("No exception raised for kernel on default stream!");
 }

From 9336c172b1f61408e2392cbbd953e7f7e6e9ae3d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Wed, 31 Jul 2024 16:27:26 +0100
Subject: [PATCH 016/270] Add upper bound pin for polars (#16442)

This aligns the polars dependency with the latest version supported by
cudf-polars on this branch.
Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16442
---
 dependencies.yaml                 | 2 +-
 python/cudf_polars/pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 0fa32404156..aeb030313ed 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -630,7 +630,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.0
+          - polars>=1.0,<1.3
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index f8a1973bdbf..424c83a5199 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.10.*,>=0.0.0a0",
-    "polars>=1.0",
+    "polars>=1.0,<1.3",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From 0f3b3808348debca8458bf73575745770b494ddc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 31 Jul 2024 07:38:56 -1000
Subject: [PATCH 017/270] Add environment variable to log cudf.pandas fallback
 calls (#16161)

Introduces a new environment variable `LOG_FAST_FALLBACK` which will
create a structured log of the call that failed. An example of the log
is:

```
INFO:root:{"debug_type": "LOG_FAST_FALLBACK", "failed_call": "pandas._libs.interval.Interval(0,1)", "exception": "Exception", "exception_message": "Cannot transform _Unusable", "pandas_object": "pandas._libs.interval.Interval", "passed_args": "0,1,", "passed_kwargs": {}}
```

I could turn this into a warning instead, but I imagine we would want
to first use this to parse the failures and see generalized failures in
aggregate.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16161
---
 python/cudf/cudf/pandas/_logger.py         | 80 ++++++++++++++++++++++
 python/cudf/cudf/pandas/fast_slow_proxy.py |  6 +-
 2 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/pandas/_logger.py

diff --git a/python/cudf/cudf/pandas/_logger.py b/python/cudf/cudf/pandas/_logger.py
new file mode 100644
index 00000000000..68923c3e35c
--- /dev/null
+++ b/python/cudf/cudf/pandas/_logger.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import logging + +logging.basicConfig( + filename="cudf_pandas_unit_tests_debug.log", level=logging.INFO +) +logger = logging.getLogger() + + +class StructuredMessage: + # https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging + def __init__(self, debug_type: str, /, **kwargs) -> None: + self.debug_type = debug_type + self.kwargs = kwargs + + def __str__(self) -> str: + log = {"debug_type": self.debug_type} + return json.dumps({**log, **self.kwargs}) + + +def reprify(arg) -> str: + """Attempt to return arg's repr for logging.""" + try: + return repr(arg) + except Exception: + return "" + + +def log_fallback( + slow_args: tuple, slow_kwargs: dict, exception: Exception +) -> None: + """Log when a fast call falls back to the slow path.""" + caller = slow_args[0] + module = getattr(caller, "__module__", "") + obj_name = getattr(caller, "__qualname__", type(caller).__qualname__) + if module: + slow_object = f"{module}.{obj_name}" + else: + slow_object = obj_name + # TODO: Maybe use inspect.signature to map called args and kwargs + # to their keyword names, but a user calling an API incorrectly would + # break this. + caller_args = slow_args[1] + args_passed = ", ".join((reprify(arg) for arg in caller_args)) + args_types_passed = ", ".join((type(arg).__name__ for arg in caller_args)) + kwargs_passed = {} + kwargs_types_passed = "" + if len(slow_args) == 3: + caller_kwargs = slow_args[2] + if caller_kwargs: + fmt_kwargs = ", ".join( + f"{kwarg}={reprify(value)}" + for kwarg, value in caller_kwargs.items() + ) + kwargs_types_passed = ", ".join( + f"{kwarg}={type(value).__name__}" + for kwarg, value in caller_kwargs.items() + ) + args_passed = f"{args_passed}, {fmt_kwargs}" + kwargs_passed = { + kwarg: reprify(value) for kwarg, value in caller_kwargs.items() + } + message = StructuredMessage( + "LOG_FAST_FALLBACK", + failed_call=f"{slow_object}({args_passed})", + exception=type(exception).__name__, + exception_message=str(exception), + slow_object=slow_object, + args_passed=args_passed, + kwargs_passed=kwargs_passed, + args_types_passed=args_types_passed, + kwargs_types_passed=kwargs_types_passed, + ) + logger.info(message) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index dfb729cae6b..bb678fd1efe 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -930,13 +930,17 @@ def _fast_slow_function_call( "Pandas debugging mode failed. " f"The exception was {e}." 
             )
-        except Exception:
+        except Exception as err:
             with nvtx.annotate(
                 "EXECUTE_SLOW",
                 color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
                 domain="cudf_pandas",
             ):
                 slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs)
+                if _env_get_bool("LOG_FAST_FALLBACK", False):
+                    from ._logger import log_fallback
+
+                    log_fallback(slow_args, slow_kwargs, err)
                 with disable_module_accelerator():
                     result = func(*slow_args, **slow_kwargs)
         return _maybe_wrap_result(result, func, *args, **kwargs), fast

From 5bcd8e062369a7d15222fa6d0bcc0b310553edbf Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:34:37 -1000
Subject: [PATCH 018/270] Align DatetimeIndex APIs with pandas 2.x (#16367)

Mostly transfers logic that was defined in `Series.dt` methods to
`DatetimeColumn` so it can be reused by `DatetimeIndex`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16367
---
 docs/cudf/source/conf.py                 |   2 +
 python/cudf/cudf/core/column/datetime.py |  56 ++++++++
 python/cudf/cudf/core/index.py           | 211 ++++++++++++++++++++++-
 python/cudf/cudf/core/series.py          |  43 ++---
 python/cudf/cudf/tests/test_datetime.py  | 107 ++++++++++++
 5 files changed, 385 insertions(+), 34 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 7421d9be298..7ebafc0da95 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -556,6 +556,8 @@ def on_missing_reference(app, env, node, contnode):
     ("py:class", "Dtype"),
     # The following are erroneously warned due to
     # https://github.com/sphinx-doc/sphinx/issues/11225
+    ("py:obj", "cudf.DatetimeIndex.time"),
+    ("py:obj", "cudf.DatetimeIndex.date"),
     ("py:obj", "cudf.Index.values_host"),
     ("py:class", "pa.Array"),
     ("py:class", "ScalarLike"),
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 73902789c11..81fbb914842 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -286,6 +286,62 @@ def dayofyear(self) -> ColumnBase:
     def day_of_year(self) -> ColumnBase:
         return self.get_dt_field("day_of_year")

+    @property
+    def is_month_start(self) -> ColumnBase:
+        return (self.day == 1).fillna(False)
+
+    @property
+    def is_month_end(self) -> ColumnBase:
+        last_day_col = libcudf.datetime.last_day_of_month(self)
+        return (self.day == last_day_col.day).fillna(False)
+
+    @property
+    def is_quarter_end(self) -> ColumnBase:
+        last_month = self.month.isin([3, 6, 9, 12])
+        return (self.is_month_end & last_month).fillna(False)
+
+    @property
+    def is_quarter_start(self) -> ColumnBase:
+        first_month = self.month.isin([1, 4, 7, 10])
+        return (self.is_month_start & first_month).fillna(False)
+
+    @property
+    def is_year_end(self) -> ColumnBase:
+        day_of_year = self.day_of_year
+        leap_dates = libcudf.datetime.is_leap_year(self)
+
+        leap = day_of_year == cudf.Scalar(366)
+        non_leap = day_of_year == cudf.Scalar(365)
+        return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna(
+            False
+        )
+
+    @property
+    def is_year_start(self) -> ColumnBase:
+        return (self.day_of_year == 1).fillna(False)
+
+    @property
+    def days_in_month(self) -> ColumnBase:
+        return libcudf.datetime.days_in_month(self)
+
+    @property
+    def day_of_week(self) -> ColumnBase:
+        raise NotImplementedError("day_of_week is currently not implemented.")
implemented.") + + @property + def is_normalized(self) -> bool: + raise NotImplementedError( + "is_normalized is currently not implemented." + ) + + def to_julian_date(self) -> ColumnBase: + raise NotImplementedError( + "to_julian_date is currently not implemented." + ) + + def normalize(self) -> ColumnBase: + raise NotImplementedError("normalize is currently not implemented.") + @property def values(self): """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8c3b091abec..40a5d9ff259 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -61,6 +61,7 @@ if TYPE_CHECKING: from collections.abc import Generator, Iterable + from datetime import tzinfo def ensure_index(index_like: Any) -> BaseIndex: @@ -1680,7 +1681,7 @@ class DatetimeIndex(Index): copy : bool Make a copy of input. freq : str, optional - This is not yet supported + Frequency of the DatetimeIndex tz : pytz.timezone or dateutil.tz.tzfile This is not yet supported ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' @@ -1847,6 +1848,210 @@ def searchsorted( value, side=side, ascending=ascending, na_position=na_position ) + def as_unit(self, unit: str, round_ok: bool = True) -> Self: + """ + Convert to a dtype with the given unit resolution. + + Currently not implemented. + + Parameters + ---------- + unit : {'s', 'ms', 'us', 'ns'} + round_ok : bool, default True + If False and the conversion requires rounding, raise ValueError. + """ + raise NotImplementedError("as_unit is currently not implemented") + + def mean(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.mean(skipna=skipna) + + def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): + return self._column.std(skipna=skipna, ddof=ddof) + + def strftime(self, date_format: str) -> Index: + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. + + Parameters + ---------- + date_format : str + Date format string (e.g. "%Y-%m-%d"). + """ + return Index._from_data( + {self.name: self._column.strftime(date_format)} + ) + + @property + def asi8(self) -> cupy.ndarray: + return self._column.astype("int64").values + + @property + def inferred_freq(self) -> cudf.DateOffset | None: + raise NotImplementedError("inferred_freq is currently not implemented") + + @property + def freq(self) -> cudf.DateOffset | None: + return self._freq + + @freq.setter + def freq(self) -> None: + raise NotImplementedError("Setting freq is currently not supported.") + + @property + def freqstr(self) -> str: + raise NotImplementedError("freqstr is currently not implemented") + + @property + def resolution(self) -> str: + """ + Returns day, hour, minute, second, millisecond or microsecond + """ + raise NotImplementedError("resolution is currently not implemented") + + @property + def unit(self) -> str: + return self._column.time_unit + + @property + def tz(self) -> tzinfo | None: + """ + Return the timezone. + + Returns + ------- + datetime.tzinfo or None + Returns None when the array is tz-naive. + """ + return getattr(self.dtype, "tz", None) + + @property + def tzinfo(self) -> tzinfo | None: + """ + Alias for tz attribute + """ + return self.tz + + def to_pydatetime(self) -> np.ndarray: + """ + Return an ndarray of ``datetime.datetime`` objects. + + Returns + ------- + numpy.ndarray + An ndarray of ``datetime.datetime`` objects. 
+ """ + return self.to_pandas().to_pydatetime() + + def to_julian_date(self) -> Index: + return Index._from_data({self.name: self._column.to_julian_date()}) + + def to_period(self, freq) -> pd.PeriodIndex: + return self.to_pandas().to_period(freq=freq) + + def normalize(self) -> Self: + """ + Convert times to midnight. + + Currently not implemented. + """ + return type(self)._from_data({self.name: self._column.normalize()}) + + @property + def time(self) -> np.ndarray: + """ + Returns numpy array of ``datetime.time`` objects. + + The time part of the Timestamps. + """ + return self.to_pandas().time + + @property + def timetz(self) -> np.ndarray: + """ + Returns numpy array of ``datetime.time`` objects with timezones. + + The time part of the Timestamps. + """ + return self.to_pandas().timetz + + @property + def date(self) -> np.ndarray: + """ + Returns numpy array of python ``datetime.date`` objects. + + Namely, the date part of Timestamps without time and + timezone information. + """ + return self.to_pandas().date + + @property + def is_month_start(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the first day of the month. + """ + return self._column.is_month_start.values + + @property + def is_month_end(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the last day of the month. + """ + return self._column.is_month_end.values + + @property + def is_quarter_end(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the last day of the quarter. + """ + return self._column.is_quarter_end.values + + @property + def is_quarter_start(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the start day of the quarter. + """ + return self._column.is_quarter_start.values + + @property + def is_year_end(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the last day of the year. + """ + return self._column.is_year_end.values + + @property + def is_year_start(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the first day of the year. + """ + return self._column.is_year_start.values + + @property + def is_normalized(self) -> bool: + """ + Returns True if all of the dates are at midnight ("no time") + """ + return self._column.is_normalized + + @property + def days_in_month(self) -> Index: + """ + Get the total number of days in the month that the date falls on. + """ + return Index._from_data({self.name: self._column.days_in_month}) + + daysinmonth = days_in_month + + @property + def day_of_week(self) -> Index: + """ + Get the day of week that the date falls on. + """ + return Index._from_data({self.name: self._column.day_of_week}) + @property # type: ignore @_performance_tracking def year(self): @@ -3391,9 +3596,11 @@ def _get_nearest_indexer( return indexer -def _validate_freq(freq: Any) -> cudf.DateOffset: +def _validate_freq(freq: Any) -> cudf.DateOffset | None: if isinstance(freq, str): return cudf.DateOffset._from_freqstr(freq) + elif freq is None: + return freq elif freq is not None and not isinstance(freq, cudf.DateOffset): raise ValueError(f"Invalid frequency: {freq}") return cast(cudf.DateOffset, freq) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 10ac1fdfc1e..929af5cd981 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4415,7 +4415,9 @@ def is_month_start(self) -> Series: """ Booleans indicating if dates are the first day of the month. 
""" - return (self.day == 1).fillna(False) + return self._return_result_like_self( + self.series._column.is_month_start + ) @property # type: ignore @_performance_tracking @@ -4462,9 +4464,7 @@ def days_in_month(self) -> Series: 11 31 dtype: int16 """ - return self._return_result_like_self( - libcudf.datetime.days_in_month(self.series._column) - ) + return self._return_result_like_self(self.series._column.days_in_month) @property # type: ignore @_performance_tracking @@ -4505,9 +4505,7 @@ def is_month_end(self) -> Series: 8 False dtype: bool """ # noqa: E501 - last_day_col = libcudf.datetime.last_day_of_month(self.series._column) - last_day = self._return_result_like_self(last_day_col) - return (self.day == last_day.dt.day).fillna(False) + return self._return_result_like_self(self.series._column.is_month_end) @property # type: ignore @_performance_tracking @@ -4546,14 +4544,10 @@ def is_quarter_start(self) -> Series: 7 False dtype: bool """ - day = self.series._column.get_dt_field("day") - first_month = self.series._column.get_dt_field("month").isin( - [1, 4, 7, 10] + return self._return_result_like_self( + self.series._column.is_quarter_start ) - result = ((day == cudf.Scalar(1)) & first_month).fillna(False) - return self._return_result_like_self(result) - @property # type: ignore @_performance_tracking def is_quarter_end(self) -> Series: @@ -4591,16 +4585,10 @@ def is_quarter_end(self) -> Series: 7 False dtype: bool """ - day = self.series._column.get_dt_field("day") - last_day = libcudf.datetime.last_day_of_month(self.series._column) - last_day = last_day.get_dt_field("day") - last_month = self.series._column.get_dt_field("month").isin( - [3, 6, 9, 12] + return self._return_result_like_self( + self.series._column.is_quarter_end ) - result = ((day == last_day) & last_month).fillna(False) - return self._return_result_like_self(result) - @property # type: ignore @_performance_tracking def is_year_start(self) -> Series: @@ -4627,10 +4615,7 @@ def is_year_start(self) -> Series: 2 True dtype: bool """ - outcol = self.series._column.get_dt_field( - "day_of_year" - ) == cudf.Scalar(1) - return self._return_result_like_self(outcol.fillna(False)) + return self._return_result_like_self(self.series._column.is_year_start) @property # type: ignore @_performance_tracking @@ -4658,13 +4643,7 @@ def is_year_end(self) -> Series: 2 False dtype: bool """ - day_of_year = self.series._column.get_dt_field("day_of_year") - leap_dates = libcudf.datetime.is_leap_year(self.series._column) - - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) - result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) - return self._return_result_like_self(result.fillna(False)) + return self._return_result_like_self(self.series._column.is_year_end) @_performance_tracking def _get_dt_field(self, field: str) -> Series: diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7ab9ff2ef23..6bc775d2a2c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -7,6 +7,7 @@ import cupy as cp import numpy as np import pandas as pd +import pandas._testing as tm import pyarrow as pa import pytest @@ -2429,3 +2430,109 @@ def test_day_month_name_locale_not_implemented(meth, klass): obj = obj.dt with pytest.raises(NotImplementedError): getattr(obj, meth)(locale="pt_BR.utf8") + + +@pytest.mark.parametrize( + "attr", + [ + "is_month_start", + "is_month_end", + "is_quarter_end", + "is_quarter_start", + "is_year_end", 
+ "is_year_start", + "days_in_month", + "timetz", + "time", + "date", + ], +) +def test_dti_datetime_attributes(attr): + data = [ + "2020-01-01", + "2020-01-31", + "2020-03-01", + "2020-03-31", + "2020-03-31", + "2020-12-31", + None, + ] + pd_dti = pd.DatetimeIndex(data, name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, attr) + expected = getattr(pd_dti, attr) + if isinstance(result, np.ndarray): + # numpy doesn't assert object arrays with NaT correctly + tm.assert_numpy_array_equal(result, expected) + else: + assert_eq(result, expected) + + +@pytest.mark.parametrize("attr", ["freq", "unit"]) +def test_dti_properties(attr): + pd_dti = pd.DatetimeIndex( + ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" + ) + cudf_dti = cudf.DatetimeIndex( + ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" + ) + + result = getattr(cudf_dti, attr) + expected = getattr(pd_dti, attr) + assert result == expected + + +def test_dti_asi8(): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = pd_dti.asi8 + expected = cudf_dti.asi8 + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["mean", {}], + pytest.param( + "std", + {}, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/16444" + ), + ), + pytest.param( + "std", + {"ddof": 0}, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/16444" + ), + ), + ], +) +def test_dti_reduction(method, kwargs): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, method)(**kwargs) + expected = getattr(pd_dti, method)(**kwargs) + assert result == expected + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["to_pydatetime", {}], + ["to_period", {"freq": "D"}], + ["strftime", {"date_format": "%Y-%m-%d"}], + ], +) +def test_dti_methods(method, kwargs): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, method)(**kwargs) + expected = getattr(pd_dti, method)(**kwargs) + assert_eq(result, expected) From e2d45d6f24adbeb3a21081e078a6c2776d550a06 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:36:08 -1000 Subject: [PATCH 019/270] Align TimedeltaIndex APIs with pandas 2.x (#16368) Mostly exposing methods that were available on the `TimedeltaColumn` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16368 --- python/cudf/cudf/core/column/timedelta.py | 12 +++ python/cudf/cudf/core/index.py | 92 +++++++++++++++++++++++ python/cudf/cudf/tests/test_timedelta.py | 39 ++++++++++ 3 files changed, 143 insertions(+) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 59ea1cc002c..47c8ed6fd95 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -251,6 +251,18 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] + def total_seconds(self) -> ColumnBase: + raise NotImplementedError("total_seconds is currently not implemented") + + def ceil(self, freq: str) -> ColumnBase: + raise NotImplementedError("ceil is currently not implemented") + + def floor(self, freq: str) -> 
ColumnBase: + raise NotImplementedError("floor is currently not implemented") + + def round(self, freq: str) -> ColumnBase: + raise NotImplementedError("round is currently not implemented") + def as_numerical_column( self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 40a5d9ff259..888ea25cdae 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2759,6 +2759,98 @@ def __getitem__(self, index): return pd.Timedelta(value) return value + def as_unit(self, unit: str, round_ok: bool = True) -> Self: + """ + Convert to a dtype with the given unit resolution. + + Currently not implemented. + + Parameters + ---------- + unit : {'s', 'ms', 'us', 'ns'} + round_ok : bool, default True + If False and the conversion requires rounding, raise ValueError. + """ + raise NotImplementedError("as_unit is currently not implemented") + + @property + def freq(self) -> cudf.DateOffset | None: + raise NotImplementedError("freq is currently not implemented") + + @property + def freqstr(self) -> str: + raise NotImplementedError("freqstr is currently not implemented") + + @property + def resolution(self) -> str: + """ + Returns day, hour, minute, second, millisecond or microsecond + """ + raise NotImplementedError("resolution is currently not implemented") + + @property + def unit(self) -> str: + return self._column.time_unit + + def to_pytimedelta(self) -> np.ndarray: + """ + Return an ndarray of ``datetime.timedelta`` objects. + + Returns + ------- + numpy.ndarray + An ndarray of ``datetime.timedelta`` objects. + """ + return self.to_pandas().to_pytimedelta() + + @property + def asi8(self) -> cupy.ndarray: + return self._column.astype("int64").values + + def sum(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.sum(skipna=skipna) + + def mean(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.mean(skipna=skipna) + + def median(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.median(skipna=skipna) + + def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): + return self._column.std(skipna=skipna, ddof=ddof) + + def total_seconds(self) -> cupy.ndarray: + """ + Return total duration of each element expressed in seconds. + + This method is currently not implemented. + """ + return self._column.total_seconds().values + + def ceil(self, freq: str) -> Self: + """ + Ceil to the specified resolution. + + This method is currently not implemented. + """ + return type(self)._from_data({self.name: self._column.ceil(freq)}) + + def floor(self, freq: str) -> Self: + """ + Floor to the specified resolution. + + This method is currently not implemented. + """ + return type(self)._from_data({self.name: self._column.floor(freq)}) + + def round(self, freq: str) -> Self: + """ + Round to the specified resolution. + + This method is currently not implemented. 
+ """ + return type(self)._from_data({self.name: self._column.round(freq)}) + @property # type: ignore @_performance_tracking def days(self): diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index c4a2349f535..d622ff6b94e 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1467,3 +1467,42 @@ def test_timedelta_series_cmpops_pandas_compatibility(data1, data2, op): got = op(gsr1, gsr2) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["sum", {}], + ["mean", {}], + ["median", {}], + ["std", {}], + ["std", {"ddof": 0}], + ], +) +def test_tdi_reductions(method, kwargs): + pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = getattr(pd_tdi, method)(**kwargs) + expected = getattr(cudf_tdi, method)(**kwargs) + assert result == expected + + +def test_tdi_asi8(): + pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = pd_tdi.asi8 + expected = cudf_tdi.asi8 + assert_eq(result, expected) + + +def test_tdi_unit(): + pd_tdi = pd.TimedeltaIndex( + ["1 day", "2 days", "3 days"], dtype="timedelta64[ns]" + ) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = pd_tdi.unit + expected = cudf_tdi.unit + assert result == expected From dab8660df7ba823dcef8cb8276a3867c2bb27cc7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:37:48 -1000 Subject: [PATCH 020/270] Align IntervalIndex APIs with pandas 2.x (#16371) Implemented the relatively straightforward, missing APIs and raised `NotImplementedError` for the others Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16371 --- docs/cudf/source/conf.py | 15 ++- python/cudf/cudf/core/column/interval.py | 64 ++++++++- python/cudf/cudf/core/index.py | 123 ++++++++++++++++++ .../cudf/cudf/tests/indexes/test_interval.py | 33 +++++ 4 files changed, 229 insertions(+), 6 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 7ebafc0da95..43e2d6031bc 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -559,15 +559,20 @@ def on_missing_reference(app, env, node, contnode): ("py:obj", "cudf.DatetimeIndex.time"), ("py:obj", "cudf.DatetimeIndex.date"), ("py:obj", "cudf.Index.values_host"), - ("py:class", "pa.Array"), - ("py:class", "ScalarLike"), - ("py:class", "ParentType"), - ("py:class", "ColumnLike"), - ("py:class", "ColumnLike"), ("py:obj", "cudf.Index.transpose"), ("py:obj", "cudf.Index.T"), ("py:obj", "cudf.Index.to_flat_index"), ("py:obj", "cudf.MultiIndex.to_flat_index"), + ("py:meth", "pyarrow.Table.to_pandas"), + ("py:class", "pa.Array"), + ("py:class", "ScalarLike"), + ("py:class", "ParentType"), + ("py:class", "pyarrow.lib.DataType"), + ("py:class", "pyarrow.lib.Table"), + ("py:class", "pyarrow.lib.Scalar"), + ("py:class", "pyarrow.lib.ChunkedArray"), + ("py:class", "pyarrow.lib.Array"), + ("py:class", "ColumnLike"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index d09a1f66539..b2f79ef0c65 100644 --- a/python/cudf/cudf/core/column/interval.py +++ 
b/python/cudf/cudf/core/column/interval.py
@@ -1,11 +1,18 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
 import pandas as pd
 import pyarrow as pa
 
 import cudf
-from cudf.core.column import StructColumn
+from cudf.core.column import StructColumn, as_column
 from cudf.core.dtypes import IntervalDtype
 
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
 
 class IntervalColumn(StructColumn):
     def __init__(
@@ -85,6 +92,61 @@ def copy(self, deep=True):
             children=struct_copy.base_children,
         )
 
+    @property
+    def is_empty(self) -> ColumnBase:
+        left_equals_right = (self.right == self.left).fillna(False)
+        not_closed_both = as_column(
+            self.dtype.closed != "both", length=len(self)
+        )
+        return left_equals_right & not_closed_both
+
+    @property
+    def is_non_overlapping_monotonic(self) -> bool:
+        raise NotImplementedError(
+            "is_non_overlapping_monotonic is currently not implemented."
+        )
+
+    @property
+    def is_overlapping(self) -> bool:
+        raise NotImplementedError(
+            "is_overlapping is currently not implemented."
+        )
+
+    @property
+    def length(self) -> ColumnBase:
+        return self.right - self.left
+
+    @property
+    def left(self) -> ColumnBase:
+        return self.children[0]
+
+    @property
+    def mid(self) -> ColumnBase:
+        try:
+            return 0.5 * (self.left + self.right)
+        except TypeError:
+            # datetime safe version
+            return self.left + 0.5 * self.length
+
+    @property
+    def right(self) -> ColumnBase:
+        return self.children[1]
+
+    def overlaps(self, other) -> ColumnBase:
+        raise NotImplementedError("overlaps is not currently implemented.")
+
+    def set_closed(
+        self, closed: Literal["left", "right", "both", "neither"]
+    ) -> IntervalColumn:
+        return IntervalColumn(
+            size=self.size,
+            dtype=IntervalDtype(self.dtype.fields["left"], closed),
+            mask=self.base_mask,
+            offset=self.offset,
+            null_count=self.null_count,
+            children=self.base_children,
+        )
+
     def as_interval_column(self, dtype):
         if isinstance(dtype, IntervalDtype):
             return IntervalColumn(
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 888ea25cdae..cd879d559cd 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3429,6 +3429,31 @@ def from_breaks(
         )
         return IntervalIndex(interval_col, name=name, closed=closed)
 
+    @classmethod
+    def from_arrays(
+        cls,
+        left,
+        right,
+        closed: Literal["left", "right", "both", "neither"] = "right",
+        copy: bool = False,
+        dtype=None,
+    ) -> Self:
+        raise NotImplementedError("from_arrays is currently not supported.")
+
+    @classmethod
+    def from_tuples(
+        cls,
+        data,
+        closed: Literal["left", "right", "both", "neither"] = "right",
+        name=None,
+        copy: bool = False,
+        dtype=None,
+    ) -> IntervalIndex:
+        piidx = pd.IntervalIndex.from_tuples(
+            data, closed=closed, name=name, copy=copy, dtype=dtype
+        )
+        return cls.from_pandas(piidx)
+
     def __getitem__(self, index):
         raise NotImplementedError(
             "Getting a scalar from an IntervalIndex is not yet supported"
@@ -3443,6 +3468,104 @@ def _is_boolean(self):
     def _clean_nulls_from_index(self):
         return self
 
+    @property
+    def is_empty(self) -> cupy.ndarray:
+        """
+        Indicates if an interval is empty, meaning it contains no points.
+        """
+        return self._column.is_empty.values
+
+    @property
+    def is_non_overlapping_monotonic(self) -> bool:
+        """
+        Return True if the IntervalIndex is non-overlapping and monotonic.
+ """ + return self._column.is_non_overlapping_monotonic + + @property + def is_overlapping(self) -> bool: + """ + Return True if the IntervalIndex has overlapping intervals, else False. + + Currently not implemented + """ + return self._column.is_overlapping + + @property + def length(self) -> Index: + """ + Return an Index with entries denoting the length of each Interval. + """ + return _index_from_data({None: self._column.length}) + + @property + def left(self) -> Index: + """ + Return left bounds of the intervals in the IntervalIndex. + + The left bounds of each interval in the IntervalIndex are + returned as an Index. The datatype of the left bounds is the + same as the datatype of the endpoints of the intervals. + """ + return _index_from_data({None: self._column.left}) + + @property + def mid(self) -> Index: + """ + Return the midpoint of each interval in the IntervalIndex as an Index. + + Each midpoint is calculated as the average of the left and right bounds + of each interval. + """ + return _index_from_data({None: self._column.mid}) + + @property + def right(self) -> Index: + """ + Return right bounds of the intervals in the IntervalIndex. + + The right bounds of each interval in the IntervalIndex are + returned as an Index. The datatype of the right bounds is the + same as the datatype of the endpoints of the intervals. + """ + return _index_from_data({None: self._column.right}) + + def overlaps(self, other) -> cupy.ndarray: + """ + Check elementwise if an Interval overlaps the values in the IntervalIndex. + + Currently not supported. + """ + return self._column.overlaps(other).values + + def set_closed( + self, closed: Literal["left", "right", "both", "neither"] + ) -> Self: + """ + Return an identical IntervalArray closed on the specified side. + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. + """ + return type(self)._from_data( + {self.name: self._column.set_closed(closed)} + ) + + def to_tuples(self, na_tuple: bool = True) -> pd.Index: + """ + Return an Index of tuples of the form (left, right). + + Parameters + ---------- + na_tuple : bool, default True + If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``, + just return ``NA`` as ``nan``. 
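+
+        For example, an index built from the tuples ``(0, 1)`` and
+        ``(1, 2)`` round-trips back to an object Index of those tuples.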
+ """ + return self.to_pandas().to_tuples(na_tuple=na_tuple) + @_performance_tracking def as_index( diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 87b76ab7609..3b3a9f96543 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -368,3 +368,36 @@ def test_intervalindex_conflicting_closed(): def test_intervalindex_invalid_data(): with pytest.raises(TypeError): cudf.IntervalIndex([1, 2]) + + +@pytest.mark.parametrize( + "attr", + [ + "is_empty", + "length", + "left", + "right", + "mid", + ], +) +def test_intervalindex_properties(attr): + pd_ii = pd.IntervalIndex.from_arrays([0, 1], [0, 2]) + cudf_ii = cudf.from_pandas(pd_ii) + + result = getattr(cudf_ii, attr) + expected = getattr(pd_ii, attr) + assert_eq(result, expected) + + +def test_set_closed(): + data = [pd.Interval(0, 1)] + result = cudf.IntervalIndex(data).set_closed("both") + expected = pd.IntervalIndex(data).set_closed("both") + assert_eq(result, expected) + + +def test_from_tuples(): + data = [(1, 2), (10, 20)] + result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a") + expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a") + assert_eq(result, expected) From be842259a835f4f7a5b9f7ff6fad1507d33c13cd Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:53:13 -0500 Subject: [PATCH 021/270] Remove cuDF dependency from pylibcudf column from_device tests (#16441) This removes the need to `import cudf` in `test_column_from_device` and removes a runtime dependency on numpy in the associated pylibcudf column method. Authors: - https://github.com/brandon-b-miller - Thomas Li (https://github.com/lithomas1) Approvers: - Thomas Li (https://github.com/lithomas1) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16441 --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 9 ++--- .../cudf/_lib/pylibcudf/libcudf/types.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/types.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 16 +++++++- .../test_column_from_device.py | 39 +++++++++++++++---- 5 files changed, 54 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index a61e0629292..1d9902b0374 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -15,13 +15,11 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar -from .types cimport DataType, type_id +from .types cimport DataType, size_of, type_id from .utils cimport int_to_bitmask_ptr, int_to_void_ptr import functools -import numpy as np - cdef class Column: """A container of nullable device data as a column of elements. 
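# A minimal sketch of the lookup path introduced by the hunk below, using the
# size_of binding and the _datatype_from_dtype_desc helper that already exist
# in this module:
#
#     data_type = _datatype_from_dtype_desc(iface['typestr'][1:])
#     itemsize = size_of(data_type)  # e.g. 4 for an int32 column
#
# which is how the C-contiguity check drops its runtime numpy dependency.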
@@ -303,14 +301,15 @@ cdef class Column: raise ValueError("mask not yet supported.") typestr = iface['typestr'][1:] + data_type = _datatype_from_dtype_desc(typestr) + if not is_c_contiguous( iface['shape'], iface['strides'], - np.dtype(typestr).itemsize + size_of(data_type) ): raise ValueError("Data must be C-contiguous") - data_type = _datatype_from_dtype_desc(typestr) size = iface['shape'][0] return Column( data_type, diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd index 8e94ec296cf..eabae68bc90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd @@ -98,3 +98,5 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: HIGHER MIDPOINT NEAREST + + cdef size_type size_of(data_type t) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index 7d3ddca14a1..1f3e1aa2fbb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -27,3 +27,5 @@ cdef class DataType: @staticmethod cdef DataType from_libcudf(data_type dt) + +cpdef size_type size_of(DataType t) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index c45c6071bb3..311f9ce4046 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,7 +2,12 @@ from libc.stdint cimport int32_t -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id +from cudf._lib.pylibcudf.libcudf.types cimport ( + data_type, + size_of as cpp_size_of, + size_type, + type_id, +) from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip @@ -69,6 +74,15 @@ cdef class DataType: ret.c_obj = dt return ret +cpdef size_type size_of(DataType t): + """Returns the size in bytes of elements of the specified data_type. + + Only fixed-width types are supported. + + For details, see :cpp:func:`size_of`. 
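+
+    For example, ``size_of(DataType(type_id.INT32))`` returns 4 and
+    ``size_of(DataType(type_id.FLOAT64))`` returns 8.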
+ """ + with nogil: + return cpp_size_of(t.c_obj) SIZE_TYPE = DataType(type_to_id[size_type]()) SIZE_TYPE_ID = SIZE_TYPE.id() diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py index c4ff7bb43a5..78ee2cb100e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py +++ b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py @@ -4,7 +4,8 @@ import pytest from utils import assert_column_eq -import cudf +import rmm + from cudf._lib import pylibcudf as plc VALID_TYPES = [ @@ -35,17 +36,39 @@ def valid_type(request): return request.param +class DataBuffer: + def __init__(self, obj, dtype): + self.obj = rmm.DeviceBuffer.to_device(obj) + self.dtype = dtype + self.shape = (int(len(self.obj) / self.dtype.itemsize),) + self.strides = (self.dtype.itemsize,) + self.typestr = self.dtype.str + + @property + def __cuda_array_interface__(self): + return { + "data": self.obj.__cuda_array_interface__["data"], + "shape": self.shape, + "strides": self.strides, + "typestr": self.typestr, + "version": 0, + } + + @pytest.fixture -def valid_column(valid_type): +def input_column(valid_type): if valid_type == pa.bool_(): return pa.array([True, False, True], type=valid_type) return pa.array([1, 2, 3], type=valid_type) -def test_from_cuda_array_interface(valid_column): - col = plc.column.Column.from_cuda_array_interface_obj( - cudf.Series(valid_column) - ) - expect = valid_column +@pytest.fixture +def iface_obj(input_column): + data = input_column.to_numpy(zero_copy_only=False) + return DataBuffer(data.view("uint8"), data.dtype) + + +def test_from_cuda_array_interface(input_column, iface_obj): + col = plc.column.Column.from_cuda_array_interface_obj(iface_obj) - assert_column_eq(expect, col) + assert_column_eq(input_column, col) From 9d0c57a64d63d52182bd1c1e930180bf62404f1a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 1 Aug 2024 10:59:27 -0700 Subject: [PATCH 022/270] Add skiprows and nrows to parquet reader (#16214) closes #15144 Authors: - Thomas Li (https://github.com/lithomas1) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16214 --- python/cudf/cudf/_lib/parquet.pyx | 35 ++++++++++++----- .../cudf/cudf/_lib/pylibcudf/io/parquet.pxd | 2 +- .../cudf/cudf/_lib/pylibcudf/io/parquet.pyx | 18 ++++----- python/cudf/cudf/io/parquet.py | 23 +++++++++++ .../cudf/pylibcudf_tests/io/test_parquet.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 39 +++++++++++++++++++ python/cudf/cudf/utils/ioutils.py | 10 +++++ python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- 8 files changed, 110 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a2eed94bb3c..4a4b13b0b31 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -22,7 +22,7 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from libc.stdint cimport uint8_t +from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr @@ -132,7 +132,10 @@ cdef object _process_metadata(object df, object filepaths_or_buffers, list pa_buffers, bool allow_range_index, - bool use_pandas_metadata): + bool 
use_pandas_metadata, + size_type nrows=-1, + int64_t skip_rows=0, + ): add_df_col_struct_names(df, child_names) index_col = None @@ -221,9 +224,13 @@ cdef object _process_metadata(object df, else: idx = cudf.Index(cudf.core.column.column_empty(0)) else: + start = range_index_meta["start"] + skip_rows + stop = range_index_meta["stop"] + if nrows != -1: + stop = start + nrows idx = cudf.RangeIndex( - start=range_index_meta['start'], - stop=range_index_meta['stop'], + start=start, + stop=stop, step=range_index_meta['step'], name=range_index_meta['name'] ) @@ -260,7 +267,9 @@ def read_parquet_chunked( row_groups=None, use_pandas_metadata=True, size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000 + size_t pass_read_limit=1024000000, + size_type nrows=-1, + int64_t skip_rows=0 ): # Convert NativeFile buffers to NativeFileDatasource, # but save original buffers in case we need to use @@ -287,7 +296,9 @@ def read_parquet_chunked( row_groups, use_pandas_metadata, chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit + pass_read_limit=pass_read_limit, + skip_rows=skip_rows, + nrows=nrows, ) tbl_w_meta = reader.read_chunk() @@ -320,13 +331,16 @@ def read_parquet_chunked( df = _process_metadata(df, column_names, child_names, per_file_user_data, row_groups, filepaths_or_buffers, pa_buffers, - allow_range_index, use_pandas_metadata) + allow_range_index, use_pandas_metadata, + nrows=nrows, skip_rows=skip_rows) return df cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, use_pandas_metadata=True, - Expression filters=None): + Expression filters=None, + size_type nrows=-1, + int64_t skip_rows=0): """ Cython function to call into libcudf API, see `read_parquet`. @@ -362,6 +376,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, filters, convert_strings_to_categories = False, use_pandas_metadata = use_pandas_metadata, + skip_rows = skip_rows, + nrows = nrows, ) df = cudf.DataFrame._from_data( @@ -371,7 +387,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, row_groups, filepaths_or_buffers, pa_buffers, - allow_range_index, use_pandas_metadata) + allow_range_index, use_pandas_metadata, + nrows=nrows, skip_rows=skip_rows) return df cpdef read_parquet_metadata(filepaths_or_buffers): diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd index 027f215fb91..93ef849b813 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd @@ -28,7 +28,7 @@ cpdef read_parquet( bool convert_strings_to_categories = *, bool use_pandas_metadata = *, int64_t skip_rows = *, - size_type num_rows = *, + size_type nrows = *, # disabled see comment in parquet.pyx for more # ReaderColumnSchema reader_column_schema = *, # DataType timestamp_type = * diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx index 96119e1b714..84a79f9565f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx @@ -26,7 +26,7 @@ cdef parquet_reader_options _setup_parquet_reader_options( bool convert_strings_to_categories = False, bool use_pandas_metadata = True, int64_t skip_rows = 0, - size_type num_rows = -1, + size_type nrows = -1, # ReaderColumnSchema reader_column_schema = None, # DataType timestamp_type = 
DataType(type_id.EMPTY) ): @@ -40,8 +40,8 @@ cdef parquet_reader_options _setup_parquet_reader_options( ) if row_groups is not None: opts.set_row_groups(row_groups) - if num_rows != -1: - opts.set_num_rows(num_rows) + if nrows != -1: + opts.set_num_rows(nrows) if skip_rows != 0: opts.set_skip_rows(skip_rows) if columns is not None: @@ -73,7 +73,7 @@ cdef class ChunkedParquetReader: Whether to convert string columns to the category type skip_rows : int64_t, default 0 The number of rows to skip from the start of the file. - num_rows : size_type, default -1 + nrows : size_type, default -1 The number of rows to read. By default, read the entire file. chunk_read_limit : size_t, default 0 Limit on total number of bytes to be returned per read, @@ -90,7 +90,7 @@ cdef class ChunkedParquetReader: bool use_pandas_metadata=True, bool convert_strings_to_categories=False, int64_t skip_rows = 0, - size_type num_rows = -1, + size_type nrows = -1, size_t chunk_read_limit=0, size_t pass_read_limit=1024000000 ): @@ -103,7 +103,7 @@ cdef class ChunkedParquetReader: convert_strings_to_categories=convert_strings_to_categories, use_pandas_metadata=use_pandas_metadata, skip_rows=skip_rows, - num_rows=num_rows, + nrows=nrows, ) with nogil: @@ -152,7 +152,7 @@ cpdef read_parquet( bool convert_strings_to_categories = False, bool use_pandas_metadata = True, int64_t skip_rows = 0, - size_type num_rows = -1, + size_type nrows = -1, # Disabled, these aren't used by cudf-python # we should only add them back in if there's user demand # ReaderColumnSchema reader_column_schema = None, @@ -178,7 +178,7 @@ cpdef read_parquet( the per-file user metadata of the ``TableWithMetadata`` skip_rows : int64_t, default 0 The number of rows to skip from the start of the file. - num_rows : size_type, default -1 + nrows : size_type, default -1 The number of rows to read. By default, read the entire file. 
Returns @@ -195,7 +195,7 @@ cpdef read_parquet( convert_strings_to_categories, use_pandas_metadata, skip_rows, - num_rows, + nrows, ) with nogil: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 7dab2f20100..4a419a2fbb6 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -539,6 +539,8 @@ def read_parquet( open_file_options=None, bytes_per_thread=None, dataset_kwargs=None, + nrows=None, + skip_rows=None, *args, **kwargs, ): @@ -685,6 +687,8 @@ def read_parquet( partition_keys=partition_keys, partition_categories=partition_categories, dataset_kwargs=dataset_kwargs, + nrows=nrows, + skip_rows=skip_rows, **kwargs, ) # Apply filters row-wise (if any are defined), and return @@ -813,6 +817,8 @@ def _parquet_to_frame( partition_keys=None, partition_categories=None, dataset_kwargs=None, + nrows=None, + skip_rows=None, **kwargs, ): # If this is not a partitioned read, only need @@ -820,11 +826,18 @@ def _parquet_to_frame( if not partition_keys: return _read_parquet( paths_or_buffers, + nrows=nrows, + skip_rows=skip_rows, *args, row_groups=row_groups, **kwargs, ) + if nrows is not None or skip_rows is not None: + raise NotImplementedError( + "nrows/skip_rows is not supported when reading a partitioned parquet dataset" + ) + partition_meta = None partitioning = (dataset_kwargs or {}).get("partitioning", None) if hasattr(partitioning, "schema"): @@ -912,6 +925,8 @@ def _read_parquet( columns=None, row_groups=None, use_pandas_metadata=None, + nrows=None, + skip_rows=None, *args, **kwargs, ): @@ -934,13 +949,21 @@ def _read_parquet( columns=columns, row_groups=row_groups, use_pandas_metadata=use_pandas_metadata, + nrows=nrows if nrows is not None else -1, + skip_rows=skip_rows if skip_rows is not None else 0, ) else: + if nrows is None: + nrows = -1 + if skip_rows is None: + skip_rows = 0 return libparquet.read_parquet( filepaths_or_buffers, columns=columns, row_groups=row_groups, use_pandas_metadata=use_pandas_metadata, + nrows=nrows, + skip_rows=skip_rows, ) else: if ( diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py index 07d2ab3d69a..dbd20cd473e 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py @@ -31,7 +31,7 @@ def test_read_parquet_basic( res = plc.io.parquet.read_parquet( plc.io.SourceInfo([source]), - num_rows=nrows, + nrows=nrows, skip_rows=skiprows, columns=columns, ) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 3806b901b10..879a2c50db7 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1978,6 +1978,25 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): assert fn == filename +@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}]) +def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): + # Checks that write_to_dataset is wrapping to_parquet + # as expected + pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) + size = 100 + pdf = pd.DataFrame( + { + "a": np.arange(0, stop=size, dtype="int64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"]) + + with pytest.raises(NotImplementedError): + cudf.read_parquet(pdf_dir, **kwargs) + + @pytest.mark.parametrize("return_meta", [True, False]) def 
test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta): pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) @@ -3768,6 +3787,26 @@ def test_parquet_chunked_reader( assert_eq(expected, actual) +@pytest.mark.parametrize( + "nrows,skip_rows", + [ + (0, 0), + (1000, 0), + (0, 1000), + (1000, 10000), + ], +) +def test_parquet_reader_nrows_skiprows(nrows, skip_rows): + df = pd.DataFrame( + {"a": [1, 2, 3, 4] * 100000, "b": ["av", "qw", "hi", "xyz"] * 100000} + ) + expected = df[skip_rows : skip_rows + nrows] + buffer = BytesIO() + df.to_parquet(buffer) + got = cudf.read_parquet(buffer, nrows=nrows, skip_rows=skip_rows) + assert_eq(expected, got) + + def test_parquet_reader_pandas_compatibility(): df = pd.DataFrame( {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000} diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 80555750b3a..448a815fe1b 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -199,6 +199,16 @@ in parallel (using a python thread pool). Default allocation is {bytes_per_thread} bytes. This parameter is functional only when `use_python_file_object=False`. +skiprows : int, default None + If not None, the number of rows to skip from the start of the file. + + .. note:: + This option is not supported when the low-memory mode is on. +nrows : int, default None + If not None, the total number of rows to read. + + .. note: + This option is not supported when the low-memory mode is on. Returns ------- diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 7f62dff4389..3754addeb11 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -321,7 +321,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - num_rows=nrows, + nrows=nrows, ) df = DataFrame.from_table( tbl_w_meta.tbl, From 05745d04e08ea494a50d12bad977af7e71aaf27b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 1 Aug 2024 17:00:19 -0400 Subject: [PATCH 023/270] Improve performance of hash_character_ngrams using warp-per-string kernel (#16212) Improves the performance of `nvtext::hash_character_ngrams` using a warp-per-string kernel instead of a string per thread. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16212 --- cpp/src/text/generate_ngrams.cu | 161 ++++++++++++++++++++++---------- 1 file changed, 113 insertions(+), 48 deletions(-) diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 724f3603f29..6f700f84ec4 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -36,10 +36,12 @@ #include #include +#include +#include #include +#include #include #include -#include #include @@ -165,6 +167,47 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s namespace detail { namespace { +constexpr cudf::thread_index_type block_size = 256; +constexpr cudf::thread_index_type bytes_per_thread = 4; + +/** + * @brief Counts the number of ngrams in each row of the given strings column + * + * Each warp processes a single string. + * Formula is `count = max(0,str.length() - ngrams + 1)` + * If a string has less than ngrams characters, its count is 0. 
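+ *
+ * For example, with ngrams=3 the string "hello" (5 characters) yields
+ * max(0, 5 - 3 + 1) = 3 ngrams, while "hi" (2 characters) yields 0.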
+ */
+CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings,
+                                          cudf::size_type ngrams,
+                                          cudf::size_type* d_counts)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) {
+    d_counts[str_idx] = 0;
+    return;
+  }
+
+  namespace cg = cooperative_groups;
+  auto const warp = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  auto const end   = d_str.data() + d_str.size_bytes();
+
+  auto const lane_idx    = warp.thread_rank();
+  cudf::size_type count  = 0;
+  for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end;
+       itr += cudf::detail::warp_size * bytes_per_thread) {
+    for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) {
+      count += static_cast<cudf::size_type>(cudf::strings::detail::is_begin_utf8_char(*s));
+    }
+  }
+  auto const char_count = cg::reduce(warp, count, cg::plus<int>());
+  if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(0, char_count - ngrams + 1); }
+}
+
 /**
  * @brief Generate character ngrams for each string
  *
@@ -220,17 +263,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  auto sizes_itr = cudf::detail::make_counting_transform_iterator(
-    0,
-    cuda::proclaim_return_type<cudf::size_type>(
-      [d_strings = *d_strings, ngrams] __device__(auto idx) {
-        if (d_strings.is_null(idx)) { return 0; }
-        auto const length = d_strings.element<cudf::string_view>(idx).length();
-        return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
-      }));
-  auto [offsets, total_ngrams] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets, total_ngrams] = [&] {
+    auto counts = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    auto const num_blocks = cudf::util::div_rounding_up_safe(
+      static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
+    count_char_ngrams_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+      *d_strings, ngrams, counts.data());
+    return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  }();
   auto d_offsets = offsets->view().data();
+
   CUDF_EXPECTS(total_ngrams > 0,
                "Insufficient number of characters in each string to generate ngrams");
@@ -246,36 +288,64 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
 }
 
 namespace {
+
 /**
  * @brief Computes the hash of each character ngram
 *
- * Each thread processes a single string. Substrings are resolved for every character
+ * Each warp processes a single string. Substrings are resolved for every character
  * of the string and hashed.
 */
-struct character_ngram_hash_fn {
-  cudf::column_device_view const d_strings;
-  cudf::size_type ngrams;
-  cudf::size_type const* d_ngram_offsets;
-  cudf::hash_value_type* d_results;
+CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings,
+                                             cudf::size_type ngrams,
+                                             cudf::size_type const* d_ngram_offsets,
+                                             cudf::hash_value_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }
 
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    if (d_strings.is_null(idx)) return;
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) return;
-    auto itr                = d_str.begin();
-    auto const ngram_offset = d_ngram_offsets[idx];
-    auto const ngram_count  = d_ngram_offsets[idx + 1] - ngram_offset;
-    auto const hasher       = cudf::hashing::detail::MurmurHash3_x86_32{0};
-    auto d_hashes           = d_results + ngram_offset;
-    for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
-      auto const begin = itr.byte_offset();
-      auto const end   = (itr + ngrams).byte_offset();
-      auto const ngram = cudf::string_view(d_str.data() + begin, end - begin);
-      *d_hashes++      = hasher(ngram);
+  auto const str_idx = idx / cudf::detail::warp_size;
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) { return; }
+
+  __shared__ cudf::hash_value_type hvs[block_size];  // temp store for hash values
+
+  auto const ngram_offset = d_ngram_offsets[str_idx];
+  auto const hasher       = cudf::hashing::detail::MurmurHash3_x86_32{0};
+
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
+  auto const lane_idx   = idx % cudf::detail::warp_size;
+
+  auto d_hashes = d_results + ngram_offset;
+  auto itr      = d_str.data() + lane_idx;
+  for (auto i = 0; i < warp_count; ++i) {
+    cudf::hash_value_type hash = 0;
+    if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) {
+      // resolve ngram substring
+      auto const sub_str =
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] =
+        cudf::strings::detail::bytes_to_character_position(sub_str, ngrams);
+      if (left == 0) { hash = hasher(cudf::string_view(itr, bytes)); }
+    }
+    hvs[threadIdx.x] = hash;  // store hash into shared memory
+    __syncwarp();
+    if (lane_idx == 0) {
+      // copy valid hash values into d_hashes
+      auto const hashes = &hvs[threadIdx.x];
+      d_hashes          = thrust::copy_if(
+        thrust::seq, hashes, hashes + cudf::detail::warp_size, d_hashes, [](auto h) {
+          return h != 0;
+        });
     }
+    __syncwarp();
+    itr += cudf::detail::warp_size;
   }
-};
+}
 
 }  // namespace
 
@@ -291,18 +361,16 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
 
   if (input.is_empty()) { return cudf::make_empty_column(output_type); }
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+  auto const grid      = cudf::detail::grid_1d(
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
 
   // build offsets column by computing the number of ngrams per string
-  auto sizes_itr = cudf::detail::make_counting_transform_iterator(
-    0,
-    cuda::proclaim_return_type<cudf::size_type>(
-      [d_strings = *d_strings, ngrams] __device__(auto idx) {
-        if (d_strings.is_null(idx)) { return 0; }
-        auto const length = d_strings.element<cudf::string_view>(idx).length();
-        return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
-      }));
-  auto [offsets, total_ngrams] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets, total_ngrams] = [&] {
+    auto counts = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    count_char_ngrams_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(
+      *d_strings, ngrams, counts.data());
+    return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  }();
   auto d_offsets = offsets->view().data();
 
   CUDF_EXPECTS(total_ngrams > 0,
@@ -313,11 +381,8 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
     cudf::make_numeric_column(output_type, total_ngrams, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_hashes = hashes->mutable_view().data();
 
-  character_ngram_hash_fn generator{*d_strings, ngrams, d_offsets, d_hashes};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input.size(),
-                     generator);
+  character_ngram_hash_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(
+    *d_strings, ngrams, d_offsets, d_hashes);
 
   return make_lists_column(
     input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);

From a8a367009ff64478d78eb916fc9dc65b77b89aac Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 1 Aug 2024 16:45:01 -0700
Subject: [PATCH 024/270] Move exception handler into pylibcudf from cudf
 (#16468)

PR to help prepare for the splitting out of pylibcudf.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16468
---
 docs/cudf/source/developer_guide/pylibcudf.md |  2 +-
 .../{ => pylibcudf}/exception_handler.pxd     |  6 +--
 .../cudf/_lib/pylibcudf/libcudf/binaryop.pxd  | 12 ++---
 .../cudf/_lib/pylibcudf/libcudf/copying.pxd   | 44 +++++++++----------
 .../_lib/pylibcudf/libcudf/lists/contains.pxd | 12 ++---
 5 files changed, 38 insertions(+), 38 deletions(-)
 rename python/cudf/cudf/_lib/{ => pylibcudf}/exception_handler.pxd (95%)

diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 0b881b2b057..2ae545a4955 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -149,7 +149,7 @@ Some guidelines on what should be tested:
   - Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead.
 - Nullable data should always be tested.
 - Expected exceptions should be tested. Tests should be written from the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated.
+ - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). If so, this exception will also need to be mapped to a suitable Python exception in `exception_handler.pxd`. Some guidelines on how best to use pytests. - By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance. diff --git a/python/cudf/cudf/_lib/exception_handler.pxd b/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd similarity index 95% rename from python/cudf/cudf/_lib/exception_handler.pxd rename to python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd index 4337d8db285..6abcd0a1c0f 100644 --- a/python/cudf/cudf/_lib/exception_handler.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # See @@ -24,7 +24,7 @@ cdef extern from *: * Since this function interoperates with Python's exception state, it * does not throw any C++ exceptions. */ - void cudf_exception_handler() + void libcudf_exception_handler() { // Catch a handful of different errors here and turn them into the // equivalent Python errors. 
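# A minimal sketch of how a libcudf declaration opts into the renamed
# handler, mirroring the binaryop.pxd hunks that follow:
#
#     from cudf._lib.pylibcudf.exception_handler cimport (
#         libcudf_exception_handler,
#     )
#
#     cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
#         cdef unique_ptr[column] binary_operation(
#             const column_view& lhs,
#             const column_view& rhs,
#             binary_operator op,
#             data_type output_type
#         ) except +libcudf_exception_handler
#
# Cython calls libcudf_exception_handler() whenever the wrapped C++ call
# throws, mapping libcudf exceptions onto the matching Python errors.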
@@ -66,4 +66,4 @@ cdef extern from *: } // anonymous namespace """ - cdef void cudf_exception_handler() + cdef void libcudf_exception_handler() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd index b34fea6a775..78da5980db4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd @@ -5,7 +5,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string -from cudf._lib.exception_handler cimport cudf_exception_handler +from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar @@ -55,28 +55,28 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const column_view& rhs, binary_operator op, data_type output_type - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, data_type output_type - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: cdef bool is_supported_operation( @@ -84,4 +84,4 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: data_type lhs_type, data_type rhs_type, binary_operator op - ) except +cudf_exception_handler + ) except +libcudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd index 001489d69bf..af3a16ad01b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd @@ -8,7 +8,7 @@ from libcpp.vector cimport vector from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.exception_handler cimport cudf_exception_handler +from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( column_view, @@ -30,25 +30,25 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const table_view& source_table, const column_view& gather_map, out_of_bounds_policy policy - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] shift( const column_view& input, size_type offset, const scalar& fill_values - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] scatter ( const table_view& source_table, const column_view& scatter_map, const table_view& target_table, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] scatter ( const vector[reference_wrapper[constscalar]]& source_scalars, const column_view& indices, const table_view& target, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cpdef enum class 
mask_allocation_policy(int32_t): NEVER @@ -57,22 +57,22 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef unique_ptr[column] empty_like ( const column_view& input_column - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, mask_allocation_policy policy - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, size_type size, mask_allocation_policy policy - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] empty_like ( const table_view& input_table - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef void copy_range_in_place ( const column_view& input_column, @@ -80,7 +80,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_range ( const column_view& input_column, @@ -88,68 +88,68 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[column_view] slice ( const column_view& input_column, vector[size_type] indices - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[table_view] slice ( const table_view& input_table, vector[size_type] indices - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[column_view] split ( const column_view& input_column, vector[size_type] splits - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[table_view] split ( const table_view& input_table, vector[size_type] splits - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const column_view& rhs, const column_view& boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const column_view& rhs, const column_view& boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const scalar& rhs, const column_view boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const scalar& rhs, const column_view boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const table_view& input, const table_view& target, const column_view& boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const vector[reference_wrapper[constscalar]]& input, const table_view& target, const column_view& boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[scalar] get_element ( const column_view& input, size_type index - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cpdef enum class sample_with_replacement(bool): FALSE diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd index 82aed7d70a0..40bb2e78970 100644 --- 
a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd @@ -3,7 +3,7 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr -from cudf._lib.exception_handler cimport cudf_exception_handler +from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( @@ -21,25 +21,25 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] contains( const lists_column_view& lists, const scalar& search_key, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] contains( const lists_column_view& lists, const column_view& search_keys, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] contains_nulls( const lists_column_view& lists, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] index_of( const lists_column_view& lists, const scalar& search_key, duplicate_find_option find_option, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] index_of( const lists_column_view& lists, const column_view& search_keys, duplicate_find_option find_option, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler From cc19d8a7b424abbc87f7767e3bc60c54390dc9e3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Aug 2024 09:34:27 -1000 Subject: [PATCH 025/270] Use explicit construction of column subclass instead of `build_column` when type is known (#16470) When we need to construct a column with a specific type, we do not need to go through the indirection of `build_column`, which matches a column subclass to a passed type, and can instead construct the subclass directly. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16470 --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/categorical.py | 46 +++++++++++++-------- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/datetime.py | 10 ++--- python/cudf/cudf/core/column/numerical.py | 43 ++++++++----------- python/cudf/cudf/core/column/string.py | 6 +-- python/cudf/cudf/core/column/timedelta.py | 8 ++-- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/index.py | 8 ++-- 9 files changed, 64 insertions(+), 65 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 18ab32d2c9e..9f36499586b 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -110,7 +110,7 @@ def _make_categorical_like(result, column): if isinstance(column, cudf.core.column.CategoricalColumn): result = cudf.core.column.build_categorical_column( categories=column.categories, - codes=cudf.core.column.build_column( + codes=cudf.core.column.NumericalColumn( result.base_data, dtype=result.dtype ), mask=result.base_mask, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9433a91b9c6..55bfae30470 100644 --- a/python/cudf/cudf/core/column/categorical.py +++
b/python/cudf/cudf/core/column/categorical.py @@ -572,13 +572,10 @@ def children(self) -> tuple[NumericalColumn]: codes_column = self.base_children[0] start = self.offset * codes_column.dtype.itemsize end = start + self.size * codes_column.dtype.itemsize - codes_column = cast( - cudf.core.column.NumericalColumn, - column.build_column( - data=codes_column.base_data[start:end], - dtype=codes_column.dtype, - size=self.size, - ), + codes_column = cudf.core.column.NumericalColumn( + data=codes_column.base_data[start:end], + dtype=codes_column.dtype, + size=self.size, ) self._children = (codes_column,) return self._children @@ -660,8 +657,9 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: Self, cudf.core.column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.build_column( - codes.base_data, dtype=codes.dtype + codes=cudf.core.column.NumericalColumn( + codes.base_data, # type: ignore[arg-type] + dtype=codes.dtype, ), mask=codes.base_mask, ordered=self.ordered, @@ -734,7 +732,10 @@ def sort_values( codes = self.codes.sort_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories._values, - codes=column.build_column(codes.base_data, dtype=codes.dtype), + codes=cudf.core.column.NumericalColumn( + codes.base_data, # type: ignore[arg-type] + dtype=codes.dtype, + ), mask=codes.base_mask, size=codes.size, ordered=self.dtype.ordered, @@ -842,7 +843,10 @@ def unique(self) -> CategoricalColumn: codes = self.codes.unique() return column.build_categorical_column( categories=self.categories, - codes=column.build_column(codes.base_data, dtype=codes.dtype), + codes=cudf.core.column.NumericalColumn( + codes.base_data, # type: ignore[arg-type] + dtype=codes.dtype, + ), mask=codes.base_mask, offset=codes.offset, size=codes.size, @@ -980,7 +984,9 @@ def find_and_replace( result = column.build_categorical_column( categories=new_cats["cats"], - codes=column.build_column(output.base_data, dtype=output.dtype), + codes=cudf.core.column.NumericalColumn( + output.base_data, dtype=output.dtype + ), mask=output.base_mask, offset=output.offset, size=output.size, @@ -1176,8 +1182,9 @@ def _concat( return column.build_categorical_column( categories=column.as_column(cats), - codes=column.build_column( - codes_col.base_data, dtype=codes_col.dtype + codes=cudf.core.column.NumericalColumn( + codes_col.base_data, # type: ignore[arg-type] + dtype=codes_col.dtype, ), mask=codes_col.base_mask, size=codes_col.size, @@ -1190,8 +1197,9 @@ def _with_type_metadata( if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=column.build_column( - self.codes.base_data, dtype=self.codes.dtype + codes=cudf.core.column.NumericalColumn( + self.codes.base_data, # type: ignore[arg-type] + dtype=self.codes.dtype, ), mask=self.codes.base_mask, ordered=dtype.ordered, @@ -1339,7 +1347,7 @@ def _set_categories( Self, column.build_categorical_column( categories=new_cats, - codes=column.build_column( + codes=cudf.core.column.NumericalColumn( new_codes.base_data, dtype=new_codes.dtype ), mask=new_codes.base_mask, @@ -1472,7 +1480,9 @@ def pandas_categorical_as_column( return column.build_categorical_column( categories=categorical.categories, - codes=column.build_column(codes.base_data, codes.dtype), + codes=cudf.core.column.NumericalColumn( + codes.base_data, dtype=codes.dtype + ), size=codes.size, mask=mask, ordered=categorical.ordered, diff --git 
a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7e0d8ced595..a7d2cb441dd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1506,7 +1506,7 @@ def column_empty( elif isinstance(dtype, CategoricalDtype): data = None children = ( - build_column( + cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( size=row_count diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 81fbb914842..ce67ce81e6b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -473,15 +473,15 @@ def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] def as_numerical_column( self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - col = column.build_column( - data=self.base_data, - dtype=np.int64, + ) -> cudf.core.column.NumericalColumn: + col = cudf.core.column.NumericalColumn( + data=self.base_data, # type: ignore[arg-type] + dtype=np.dtype(np.int64), mask=self.base_mask, offset=self.offset, size=self.size, ) - return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) + return cast(cudf.core.column.NumericalColumn, col.astype(dtype)) def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f9404eb3b40..c326a10c844 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -13,13 +13,7 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf.api.types import is_integer, is_scalar -from cudf.core.column import ( - ColumnBase, - as_column, - build_column, - column, - string, -) +from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError @@ -338,29 +332,23 @@ def as_string_column(self) -> cudf.core.column.StringColumn: def as_datetime_column( self, dtype: Dtype ) -> cudf.core.column.DatetimeColumn: - return cast( - "cudf.core.column.DatetimeColumn", - build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), + return cudf.core.column.DatetimeColumn( + data=self.astype("int64").base_data, # type: ignore[arg-type] + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) def as_timedelta_column( self, dtype: Dtype ) -> cudf.core.column.TimeDeltaColumn: - return cast( - "cudf.core.column.TimeDeltaColumn", - build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), + return cudf.core.column.TimeDeltaColumn( + data=self.astype("int64").base_data, # type: ignore[arg-type] + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) def as_decimal_column( @@ -637,7 +625,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=build_column(self.base_data, dtype=self.dtype), + codes=cudf.core.column.NumericalColumn( + self.base_data, # type: ignore[arg-type] + dtype=self.dtype, + ), mask=self.base_mask, ordered=dtype.ordered, size=self.size, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py 
index ec95c50f455..b422ff86b17 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5934,9 +5934,9 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": n_bytes_to_view = str_end_byte_offset - str_byte_offset - to_view = column.build_column( - self.base_data, - dtype=cudf.api.types.dtype("int8"), + to_view = cudf.core.column.NumericalColumn( + self.base_data, # type: ignore[arg-type] + dtype=np.dtype(np.int8), offset=str_byte_offset, size=n_bytes_to_view, ) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 47c8ed6fd95..ba0dc4779bb 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -265,10 +265,10 @@ def round(self, freq: str) -> ColumnBase: def as_numerical_column( self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - col = column.build_column( - data=self.base_data, - dtype=np.int64, + ) -> cudf.core.column.NumericalColumn: + col = cudf.core.column.NumericalColumn( + data=self.base_data, # type: ignore[arg-type] + dtype=np.dtype(np.int64), mask=self.base_mask, offset=self.offset, size=self.size, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 52dc29974bf..865d2706ca3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -46,10 +46,10 @@ from cudf.core.column import ( CategoricalColumn, ColumnBase, + NumericalColumn, StructColumn, as_column, build_categorical_column, - build_column, column_empty, concat_columns, ) @@ -8543,7 +8543,7 @@ def _reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=build_column( + codes=NumericalColumn( cols[name].base_data, dtype=cols[name].dtype ), mask=cols[name].base_mask, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd879d559cd..0d29ef07e7d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2434,12 +2434,10 @@ def to_pandas( return result @_performance_tracking - def _get_dt_field(self, field): + def _get_dt_field(self, field: str) -> Index: + """Return an Index of a numerical component of the DatetimeIndex.""" out_column = self._values.get_dt_field(field) - # column.column_empty_like always returns a Column object - # but we need a NumericalColumn for Index.. - # how should this be handled? - out_column = column.build_column( + out_column = NumericalColumn( data=out_column.base_data, dtype=out_column.dtype, mask=out_column.base_mask, From e0d1ac1efa9153f0a084bd72b7d4c300f9640196 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Aug 2024 17:44:45 -0500 Subject: [PATCH 026/270] Fix typo in dispatch_row_equal. (#16473) This PR fixes a small typo in the C++ code. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16473 --- cpp/src/stream_compaction/distinct.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index e5cf29f3ebf..e2c5aba6802 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -51,7 +51,7 @@ namespace { * @param func The input functor to invoke */ template -rmm::device_uvector dipatch_row_equal( +rmm::device_uvector dispatch_row_equal( null_equality compare_nulls, nan_equality compare_nans, bool has_nulls, @@ -110,9 +110,9 @@ rmm::device_uvector distinct_indices(table_view const& input, }; if (cudf::detail::has_nested_columns(input)) { - return dipatch_row_equal(nulls_equal, nans_equal, has_nulls, row_equal, helper_func); + return dispatch_row_equal(nulls_equal, nans_equal, has_nulls, row_equal, helper_func); } else { - return dipatch_row_equal(nulls_equal, nans_equal, has_nulls, row_equal, helper_func); + return dispatch_row_equal(nulls_equal, nans_equal, has_nulls, row_equal, helper_func); } } From af57286536fc21b47b80e45be222773b751600c9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 5 Aug 2024 07:16:34 -0500 Subject: [PATCH 027/270] Add missing pylibcudf strings docs (#16471) Noticed a few pylibcudf string docs that were missing and added them here. Authors: - https://github.com/brandon-b-miller - Thomas Li (https://github.com/lithomas1) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16471 --- .../api_docs/pylibcudf/strings/capitalize.rst | 6 +++ .../api_docs/pylibcudf/strings/char_types.rst | 6 +++ .../api_docs/pylibcudf/strings/find.rst | 6 +++ .../api_docs/pylibcudf/strings/index.rst | 5 ++ .../pylibcudf/strings/regex_flags.rst | 6 +++ .../pylibcudf/strings/regex_program.rst | 6 +++ .../_lib/pylibcudf/strings/capitalize.pyx | 48 ++++++++++++++++++- .../_lib/pylibcudf/strings/regex_program.pyx | 19 ++++++++ 8 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst new file mode 100644 index 00000000000..578b2b75e37 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst @@ -0,0 +1,6 @@ +========== +capitalize +========== + +.. automodule:: cudf._lib.pylibcudf.strings.capitalize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst new file mode 100644 index 00000000000..577ec34915b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst @@ -0,0 +1,6 @@ +========== +char_types +========== + +..
automodule:: cudf._lib.pylibcudf.strings.char_types + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst new file mode 100644 index 00000000000..61d4079e9a3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst @@ -0,0 +1,6 @@ +==== +find +==== + +.. automodule:: cudf._lib.pylibcudf.strings.find + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index cecf1ccc9bb..462a756a092 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -4,6 +4,11 @@ strings .. toctree:: :maxdepth: 1 + capitalize + char_types contains + find + regex_flags + regex_program replace slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst new file mode 100644 index 00000000000..0126b6a3706 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst @@ -0,0 +1,6 @@ +=========== +regex_flags +=========== + +.. automodule:: cudf._lib.pylibcudf.strings.regex_flags + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst new file mode 100644 index 00000000000..2f398186d51 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst @@ -0,0 +1,6 @@ +============= +regex_program +============= + +.. automodule:: cudf._lib.pylibcudf.strings.regex_program + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index d3f79088018..ccf84d25572 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -22,7 +22,22 @@ cpdef Column capitalize( # TODO: default scalar values # https://github.com/rapidsai/cudf/issues/15505 ): - + """Returns a column of capitalized strings. + + For details, see :cpp:func:`cudf::strings::capitalize`. + + Parameters + ---------- + input : Column + String column + delimiters : Scalar, default None + Characters for identifying words to capitalize + + Returns + ------- + pylibcudf.Column + Column of strings capitalized from the input column + """ cdef unique_ptr[column] c_result if delimiters is None: @@ -47,6 +62,23 @@ cpdef Column title( Column input, string_character_types sequence_type=string_character_types.ALPHA ): + """Modifies first character of each word to upper-case and lower-cases + the rest. + + For details, see :cpp:func:`cudf::strings::title`. + + Parameters + ---------- + input : Column + String column + sequence_type : string_character_types, default string_character_types.ALPHA + The character type that is used when identifying words + + Returns + ------- + pylibcudf.Column + Column of titled strings + """ cdef unique_ptr[column] c_result with nogil: c_result = cpp_capitalize.title(input.view(), sequence_type) @@ -55,6 +87,20 @@ cpdef Column title( cpdef Column is_title(Column input): + """Checks if the strings in the input column are title formatted. + + For details, see :cpp:func:`cudf::strings::is_title`. 
+ + Parameters + ---------- + input : Column + String column + + Returns + ------- + pylibcudf.Column + Column of type BOOL8 + """ cdef unique_ptr[column] c_result with nogil: c_result = cpp_capitalize.is_title(input.view()) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx index d605b0aba02..5f0b8868452 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx @@ -13,12 +13,31 @@ from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags cdef class RegexProgram: + """Regex program class. + This is the Cython representation of + :cpp:class:`cudf::strings::regex_program`. + + Do not instantiate this class directly, use the `create` method. + + """ def __init__(self, *args, **kwargs): raise ValueError("Do not instantiate RegexProgram directly, use create") @staticmethod def create(str pattern, int flags): + """Create a program from a pattern. + + For details, see :cpp:func:`cudf::strings::regex_program::create`. + + Parameters + ---------- + pattern : str + Regex pattern + flags : Union[int, RegexFlags] + Regex flags for interpreting special characters in the pattern + + """ cdef unique_ptr[regex_program] c_prog cdef regex_flags c_flags cdef string c_pattern = pattern.encode() From 837dfe51a2f4d0268d6976464eed637645f524ff Mon Sep 17 00:00:00 2001 From: Rahul Prabhu <100436830+sdrp713@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:14:41 -0700 Subject: [PATCH 028/270] Added batch memset to memset data and validity buffers in parquet reader (#16281) Under some situations in the Parquet reader (particularly the case with tables containing many columns or deeply nested columns) we burn a decent amount of time doing cudaMemset() operations on output buffers. A good amount of this overhead seems to stem from the fact that we're simply launching many tiny kernels. This PR adds a batched memset kernel that takes a list of device spans as a single input and does all the work under a single kernel launch.
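For readers skimming the diff, a minimal sketch of the batching trick may help. It leans on `cub::DeviceCopy::Batched`, the same primitive the new `cudf::io::detail::batched_memset` helper (introduced later in this patch) is built on; the `zero_buffers` helper, its fixed `uint64_t` element type, and the use of `thrust::device_vector` are illustrative assumptions, not part of this PR:

```cpp
// Compile as CUDA (.cu): CUB's device-scope algorithms launch kernels.
#include <cub/device/device_copy.cuh>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>

#include <cstddef>
#include <cstdint>
#include <vector>

// Zero many independent device buffers with one batched launch instead of
// issuing one cudaMemsetAsync() per buffer (hypothetical helper, for
// illustration only).
void zero_buffers(std::vector<std::uint64_t*> const& bufs,
                  std::vector<std::size_t> const& sizes,  // element counts
                  cudaStream_t stream)
{
  // The per-buffer pointer and size arrays must be readable on the device,
  // since the batched kernel dereferences them on the GPU.
  thrust::device_vector<std::uint64_t*> d_ptrs(bufs);
  thrust::device_vector<std::size_t> d_sizes(sizes);

  // Every "input range" is an endless stream of zeros; each "output range"
  // is one destination buffer. Batched then writes sizes[i] zeros into
  // bufs[i] for every i, all in a single launch.
  auto zeros = thrust::make_constant_iterator(
    thrust::make_constant_iterator(std::uint64_t{0}));

  std::size_t temp_bytes = 0;  // first call only sizes the scratch space
  cub::DeviceCopy::Batched(nullptr, temp_bytes, zeros,
                           thrust::raw_pointer_cast(d_ptrs.data()),
                           thrust::raw_pointer_cast(d_sizes.data()),
                           static_cast<std::uint32_t>(bufs.size()), stream);
  thrust::device_vector<char> temp(temp_bytes);
  cub::DeviceCopy::Batched(thrust::raw_pointer_cast(temp.data()), temp_bytes,
                           zeros,
                           thrust::raw_pointer_cast(d_ptrs.data()),
                           thrust::raw_pointer_cast(d_sizes.data()),
                           static_cast<std::uint32_t>(bufs.size()), stream);
}
```

The production version in `batched_memset.hpp` below follows the same shape, but templates the value type and builds the pointer and size sequences as transform iterators over a single device copy of the spans.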
This PR addresses issue #15773.

## Improvements

Using our performance cluster, improvements of 2.39% were shown when running the overall NDS queries. Additionally, benchmarks were added showing large improvements (around 20%), especially on fixed-width data types, as shown below:

data_type | num_cols | cardinality | run_length | bytes_per_second_before_this_pr | bytes_per_second_after_this_pr | speedup
--- | --- | --- | --- | --- | --- | ---
INTEGRAL | 1000 | 0 | 1 | 36514934834 | 42756531566 | 1.170932709
INTEGRAL | 1000 | 1000 | 1 | 35364061247 | 39112512476 | 1.105996062
INTEGRAL | 1000 | 0 | 32 | 37349112510 | 39641370858 | 1.061373837
INTEGRAL | 1000 | 1000 | 32 | 39167079622 | 43740824957 | 1.116775245
FLOAT | 1000 | 0 | 1 | 51877322003 | 64083898838 | 1.235296973
FLOAT | 1000 | 1000 | 1 | 48983612272 | 58705522023 | 1.198472699
FLOAT | 1000 | 0 | 32 | 46544977658 | 53715018581 | 1.154045426
FLOAT | 1000 | 1000 | 32 | 54493432148 | 66617609904 | 1.22248879
DECIMAL | 1000 | 0 | 1 | 47616412888 | 57952310685 | 1.217065864
DECIMAL | 1000 | 1000 | 1 | 47166138095 | 54283772484 | 1.1509056
DECIMAL | 1000 | 0 | 32 | 45266163387 | 53770390830 | 1.18787162
DECIMAL | 1000 | 1000 | 32 | 52292176603 | 58847723569 | 1.125363819
TIMESTAMP | 1000 | 0 | 1 | 50245415328 | 60797982330 | 1.210020495
TIMESTAMP | 1000 | 1000 | 1 | 50300238706 | 60810368331 | 1.208947908
TIMESTAMP | 1000 | 0 | 32 | 55338354243 | 66786275739 | 1.206871376
TIMESTAMP | 1000 | 1000 | 32 | 55680028082 | 69029227374 | 1.23974843
DURATION | 1000 | 0 | 1 | 54680007758 | 66855201896 | 1.222662626
DURATION | 1000 | 1000 | 1 | 54305832171 | 66602436269 | 1.226432477
DURATION | 1000 | 0 | 32 | 60040760815 | 72663056969 | 1.210228784
DURATION | 1000 | 1000 | 32 | 60212221703 | 75646396131 | 1.256329595
STRING | 1000 | 0 | 1 | 29691707753 | 33388700976 | 1.12451265
STRING | 1000 | 1000 | 1 | 31411129876 | 35407241037 | 1.127219593
STRING | 1000 | 0 | 32 | 29680479388 | 33382478907 | 1.124728427
STRING | 1000 | 1000 | 32 | 35476213777 | 40478389269 | 1.141000827
LIST | 1000 | 0 | 1 | 6874253484 | 7370835717 | 1.072237987
LIST | 1000 | 1000 | 1 | 6763426009 | 7253762966 | 1.07249831
LIST | 1000 | 0 | 32 | 6981508808 | 7502741115 | 1.074658977
LIST | 1000 | 1000 | 32 | 6989374761 | 7506418252 | 1.073975643
STRUCT | 1000 | 0 | 1 | 2137525922 | 2189495762 | 1.024313081
STRUCT | 1000 | 1000 | 1 | 1057923939 | 1078475980 | 1.019426766
STRUCT | 1000 | 0 | 32 | 1637342446 | 1698913790 | 1.037604439
STRUCT | 1000 | 1000 | 32 | 1057587701 | 1082539399 | 1.02359303

Authors: - Rahul Prabhu (https://github.com/sdrp713) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - https://github.com/nvdbaranec - Muhammad Haseeb (https://github.com/mhaseeb123) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16281 --- cpp/benchmarks/CMakeLists.txt | 5 + .../io/utilities/batched_memset_bench.cpp | 101 ++++++++++++++++++ cpp/include/cudf/io/detail/batched_memset.hpp | 82 ++++++++++++++ cpp/src/io/parquet/reader_impl_preprocess.cu | 29 ++++- cpp/src/io/utilities/column_buffer.cpp | 29 +++-- cpp/src/io/utilities/column_buffer.hpp | 23 +++- cpp/tests/CMakeLists.txt | 1 + .../utilities_tests/batched_memset_tests.cu | 97 +++++++++++++++++ 8 files changed, 353 insertions(+), 14 deletions(-) create mode 100644 cpp/benchmarks/io/utilities/batched_memset_bench.cpp create mode 100644 cpp/include/cudf/io/detail/batched_memset.hpp create mode 100644
cpp/tests/utilities_tests/batched_memset_tests.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ff431c7f260..7be456ddfba 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -353,6 +353,11 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) +# ################################################################################################## +# * multi buffer memset benchmark +# ---------------------------------------------------------------------- +ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) + # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp new file mode 100644 index 00000000000..2905895a63b --- /dev/null +++ b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +#include + +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput +constexpr size_t data_size = 512 << 20; + +void parquet_read_common(cudf::size_type num_rows_to_read, + cudf::size_type num_cols_to_read, + cuio_source_sink_pair& source_sink, + nvbench::state& state) +{ + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + + auto mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto const result = cudf::io::read_parquet(read_opts); + timer.stop(); + + CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); +} + +template +void bench_batched_memset(nvbench::state& state, nvbench::type_list>) +{ + auto const d_type = get_type_or_group(static_cast(DataType)); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = cudf::io::compression_type::NONE; + cuio_source_sink_pair source_sink(source_type); + auto const tbl = + create_random_table(cycle_dtypes(d_type, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(compression); + cudf::io::write_parquet(write_opts); + auto const num_rows = view.num_rows(); + + parquet_read_common(num_rows, num_cols, source_sink, state); +} + +using d_type_list = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) + .set_name("batched_memset") + .set_type_axes_names({"data_type"}) + .add_int64_axis("num_cols", {1000}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/io/detail/batched_memset.hpp new file mode 100644 index 00000000000..d0922cc64ee --- /dev/null +++ b/cpp/include/cudf/io/detail/batched_memset.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::detail { + +/** + * @brief A helper function that takes in a vector of device spans and memsets them to the + * value provided using batches sent to the GPU. + * + * @param bufs Vector with device spans of data + * @param value Value to memset all device spans to + * @param stream Stream used for device memory operations and kernel launches + * + * @return The data in device spans all set to value + */ +template +void batched_memset(std::vector> const& bufs, + T const value, + rmm::cuda_stream_view stream) +{ + // define task and bytes parameters + auto const num_bufs = bufs.size(); + + // copy bufs into device memory and then get sizes + auto gpu_bufs = + cudf::detail::make_device_uvector_async(bufs, stream, rmm::mr::get_current_device_resource()); + + // get a vector with the sizes of all buffers + auto sizes = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); })); + + // get an iterator with a constant value to memset + auto iter_in = thrust::make_constant_iterator(thrust::make_constant_iterator(value)); + + // get an iterator pointing to each device span + auto iter_out = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type( + [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].data(); })); + + size_t temp_storage_bytes = 0; + + cub::DeviceCopy::Batched(nullptr, temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); + + rmm::device_buffer d_temp_storage( + temp_storage_bytes, stream, rmm::mr::get_current_device_resource()); + + cub::DeviceCopy::Batched( + d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); +} + +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e006cc7d714..557b1a45c1f 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -1494,6 +1495,11 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // buffers if they are not part of a list hierarchy. mark down // if we have any list columns that need further processing.
bool has_lists = false; + // Casting to std::byte since data buffer pointer is void * + std::vector> memset_bufs; + // Validity Buffer is a uint32_t pointer + std::vector> nullmask_bufs; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const& input_col = _input_columns[idx]; size_t const max_depth = input_col.nesting_depth(); @@ -1514,13 +1520,19 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // we're going to start null mask as all valid and then turn bits off if necessary out_buf.create_with_mask( out_buf.type.id() == type_id::LIST && l_idx < max_depth ? num_rows + 1 : num_rows, - cudf::mask_state::ALL_VALID, + cudf::mask_state::UNINITIALIZED, + false, _stream, _mr); + memset_bufs.push_back(cudf::device_span(static_cast(out_buf.data()), + out_buf.data_size())); + nullmask_bufs.push_back(cudf::device_span( + out_buf.null_mask(), + cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) / + sizeof(cudf::bitmask_type))); } } } - // compute output column sizes by examining the pages of the -input- columns if (has_lists) { auto h_cols_info = @@ -1593,11 +1605,22 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // allocate // we're going to start null mask as all valid and then turn bits off if necessary - out_buf.create_with_mask(size, cudf::mask_state::ALL_VALID, _stream, _mr); + out_buf.create_with_mask(size, cudf::mask_state::UNINITIALIZED, false, _stream, _mr); + memset_bufs.push_back(cudf::device_span( + static_cast(out_buf.data()), out_buf.data_size())); + nullmask_bufs.push_back(cudf::device_span( + out_buf.null_mask(), + cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) / + sizeof(cudf::bitmask_type))); } } } } + + cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + // Need to set null mask bufs to all high bits + cudf::io::detail::batched_memset( + nullmask_bufs, std::numeric_limits::max(), _stream); } std::vector reader::impl::calculate_page_string_offsets() diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 2f4272b0367..8abfb000b94 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -33,7 +33,7 @@ namespace cudf::io::detail { -void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void gather_column_buffer::allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // The contents of _strings will never be directly returned to the user. @@ -56,11 +56,12 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(bool memset_data, + rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. 
- _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); + _data = create_data(data_type{type_to_id()}, size + 1, memset_data, stream, _mr); } void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, @@ -93,6 +94,7 @@ void copy_buffer_data(string_policy const& buff, string_policy& new_buff) template void column_buffer_base::create_with_mask(size_type _size, cudf::mask_state null_mask_state, + bool memset_data, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -100,16 +102,20 @@ void column_buffer_base::create_with_mask(size_type _size, _mr = mr; switch (type.id()) { - case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; + case type_id::STRING: + static_cast(this)->allocate_strings_data(memset_data, stream); + break; // list columns store a buffer of int32's as offsets to represent // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; + case type_id::LIST: + _data = create_data(data_type{type_to_id()}, size, memset_data, stream, _mr); + break; // struct columns store no data themselves. just validity and children. case type_id::STRUCT: break; - default: _data = create_data(type, size, stream, _mr); break; + default: _data = create_data(type, size, memset_data, stream, _mr); break; } if (is_nullable) { _null_mask = @@ -117,12 +123,21 @@ void column_buffer_base::create_with_mask(size_type _size, } } +template +void column_buffer_base::create(size_type _size, + bool memset_data, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + create_with_mask(_size, mask_state::ALL_NULL, memset_data, stream, mr); +} + template void column_buffer_base::create(size_type _size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - create_with_mask(_size, mask_state::ALL_NULL, stream, mr); + create_with_mask(_size, mask_state::ALL_NULL, true, stream, mr); } template diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index ed6bb8bbdca..b2290965bb9 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -44,6 +44,7 @@ namespace detail { * * @param type The intended data type to populate * @param size The number of elements to be represented by the mask + * @param memset_data Defines whether data should be memset to 0 * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @@ -51,17 +52,25 @@ namespace detail { */ inline rmm::device_buffer create_data(data_type type, size_type size, + bool memset_data, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { std::size_t data_size = size_of(type) * size; rmm::device_buffer data(data_size, stream, mr); - CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value())); - + if (memset_data) { CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value())); } return data; } +inline rmm::device_buffer create_data(data_type type, + size_type size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return create_data(type, size, true, stream, mr); +} + using string_index_pair = thrust::pair; // forward declare friend functions @@ -113,12 +122,18 @@ class column_buffer_base { // instantiate a column of known type with a specified size. 
Allows deferred creation for // preprocessing steps such as in the Parquet reader + void create(size_type _size, + bool memset_data, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + void create(size_type _size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); // like create(), but also takes a `cudf::mask_state` to allow initializing the null mask as // something other than `ALL_NULL` void create_with_mask(size_type _size, cudf::mask_state null_mask_state, + bool memset_data, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -192,7 +207,7 @@ class gather_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream); + void allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream); [[nodiscard]] void* data_impl() { return _strings ? _strings->data() : _data.data(); } [[nodiscard]] void const* data_impl() const { return _strings ? _strings->data() : _data.data(); } @@ -226,7 +241,7 @@ class inline_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream); + void allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream); void* data_impl() { return _data.data(); } [[nodiscard]] void const* data_impl() const { return _data.data(); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4dffcb41ba2..5e85b3e8adf 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -393,6 +393,7 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp + utilities_tests/batched_memset_tests.cu ) # ################################################################################################## diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu new file mode 100644 index 00000000000..9fc5baeec97 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +template +struct MultiBufferTestIntegral : public cudf::test::BaseFixture {}; + +TEST(MultiBufferTestIntegral, BasicTest1) +{ + std::vector const BUF_SIZES{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + // Creating base vector for data and setting it to all 0xFF + std::vector> expected; + std::transform(BUF_SIZES.begin(), BUF_SIZES.end(), std::back_inserter(expected), [](auto size) { + return std::vector(size + 2000, std::numeric_limits::max()); + }); + + // set buffer region to other value + std::for_each(thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())), + [](auto elem) { + std::fill_n( + thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0xEEEEEEEEEEEEEEEE); + }); + + // Copy host vector data to device + std::vector> device_bufs; + std::transform(expected.begin(), + expected.end(), + std::back_inserter(device_bufs), + [stream, mr](auto const& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + + // Initialize device buffers for memset + std::vector> memset_bufs; + std::transform( + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), BUF_SIZES.begin())), + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), BUF_SIZES.end())), + std::back_inserter(memset_bufs), + [](auto const& elem) { + return cudf::device_span(thrust::get<0>(elem).data() + 1000, thrust::get<1>(elem)); + }); + + // Function Call + cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + + // Set all buffer regions to 0 for expected comparison + std::for_each( + thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())), + [](auto elem) { std::fill_n(thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0UL); }); + + // Compare to see that only given buffers are zeroed out + std::for_each( + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), expected.begin())), + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), expected.end())), + [stream](auto const& elem) { + auto after_memset = cudf::detail::make_std_vector_async(thrust::get<0>(elem), stream); + EXPECT_TRUE( + std::equal(thrust::get<1>(elem).begin(), thrust::get<1>(elem).end(), after_memset.begin())); + }); +} From 8068a2d616b6647bcd80720a2c24af858cbffd2d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 5 Aug 2024 14:48:33 -0700 Subject: [PATCH 029/270] Fix build failures with GCC 13 (#16488) Closes #16395 This PR resolves two types of compilation errors, allowing for successful builds with GCC 13: - replacing the `cuco_allocator` strong type with an alias to fix a new build time check with GCC 13 - removing `std::move` when returning a temporary Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - David Wendt (https://github.com/davidwendt) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/16488 --- cpp/include/cudf/detail/cuco_helpers.hpp | 17 ++---- .../cudf/detail/distinct_hash_join.cuh | 2 +- .../cudf/detail/hash_reduce_by_row.cuh | 2 +- cpp/include/cudf/detail/join.hpp | 2 +- 
cpp/include/cudf_test/column_wrapper.hpp | 14 ++--- cpp/src/groupby/hash/groupby.cu | 19 +++---- cpp/src/io/json/json_tree.cu | 35 +++++++------ cpp/src/io/json/write_json.cu | 2 +- cpp/src/join/conditional_join.cu | 52 +++++++++---------- cpp/src/join/distinct_hash_join.cu | 2 +- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/join_common_utils.hpp | 8 +-- cpp/src/join/mixed_join.cu | 22 ++++---- cpp/src/join/mixed_join_semi.cu | 11 ++-- cpp/src/reductions/histogram.cu | 12 +++-- cpp/src/search/contains_table.cu | 17 +++--- cpp/src/stream_compaction/distinct.cu | 19 +++---- cpp/src/stream_compaction/distinct_count.cu | 17 +++--- .../stream_compaction/distinct_helpers.hpp | 2 +- cpp/src/text/bpe/byte_pair_encoding.cuh | 4 +- cpp/src/text/bpe/load_merge_pairs.cu | 39 +++++++------- cpp/src/text/vocabulary_tokenize.cu | 4 +- cpp/tests/copying/gather_tests.cpp | 14 ++--- cpp/tests/reshape/byte_cast_tests.cpp | 22 ++++---- cpp/tests/structs/structs_column_tests.cpp | 48 ++++++++--------- 25 files changed, 195 insertions(+), 193 deletions(-) diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp index dca5a39bece..926df921715 100644 --- a/cpp/include/cudf/detail/cuco_helpers.hpp +++ b/cpp/include/cudf/detail/cuco_helpers.hpp @@ -36,19 +36,10 @@ static double constexpr CUCO_DESIRED_LOAD_FACTOR = 0.5; * later expects a standard C++ `Allocator` interface. This allocator helper provides a simple way * to handle cuco memory allocation/deallocation with the given `stream` and the rmm default memory * resource. + * + * @tparam T The allocator's value type. */ -class cuco_allocator - : public rmm::mr::stream_allocator_adaptor> { - /// Default stream-ordered allocator type - using default_allocator = rmm::mr::polymorphic_allocator; - /// The base allocator adaptor type - using base_type = rmm::mr::stream_allocator_adaptor; - - public: - /** - * @brief Constructs the allocator adaptor with the given `stream` - */ - cuco_allocator(rmm::cuda_stream_view stream) : base_type{default_allocator{}, stream} {} -}; +template +using cuco_allocator = rmm::mr::stream_allocator_adaptor>; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index c3bc3ad89fa..0b3d7ac58bf 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -99,7 +99,7 @@ struct distinct_hash_join { cuda::thread_scope_device, comparator_adapter, probing_scheme_type, - cudf::detail::cuco_allocator, + cudf::detail::cuco_allocator, cuco_storage_type>; bool _has_nulls; ///< true if nulls are present in either build table or probe table diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh index dfe79646167..7a1e38eefe0 100644 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -32,7 +32,7 @@ namespace cudf::detail { using hash_map_type = cuco::legacy:: - static_map; + static_map>; /** * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index ff7da4462a2..af46dd79cdb 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -59,7 +59,7 @@ struct hash_join { cuco::static_multimap, cuco::legacy::double_hashing>; hash_join() = delete; diff --git a/cpp/include/cudf_test/column_wrapper.hpp 
b/cpp/include/cudf_test/column_wrapper.hpp index 4e504ec1d30..d00db222b62 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1337,7 +1337,7 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(std::initializer_list elements) : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::fixed_width_column_wrapper(elements).release())); + cudf::test::fixed_width_column_wrapper(elements).release()); } /** @@ -1361,7 +1361,7 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::fixed_width_column_wrapper(begin, end).release())); + cudf::test::fixed_width_column_wrapper(begin, end).release()); } /** @@ -1386,7 +1386,7 @@ class lists_column_wrapper : public detail::column_wrapper { : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::fixed_width_column_wrapper(elements, v).release())); + cudf::test::fixed_width_column_wrapper(elements, v).release()); } /** @@ -1413,8 +1413,8 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v) : column_wrapper{} { - build_from_non_nested(std::move( - cudf::test::fixed_width_column_wrapper(begin, end, v).release())); + build_from_non_nested( + cudf::test::fixed_width_column_wrapper(begin, end, v).release()); } /** @@ -1435,7 +1435,7 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(std::initializer_list elements) : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release())); + cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release()); } /** @@ -1460,7 +1460,7 @@ class lists_column_wrapper : public detail::column_wrapper { : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end(), v).release())); + cudf::test::strings_column_wrapper(elements.begin(), elements.end(), v).release()); } /** diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 5fe4a5eb30f..35161eada28 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -568,15 +568,16 @@ std::unique_ptr
groupby(table_view const& keys, cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{num_keys, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage<1>{}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto const set = cuco::static_set{ + num_keys, + 0.5, // desired load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_key_equal, + probing_scheme_type{d_row_hash}, + cuco::thread_scope_device, + cuco::storage<1>{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; // Compute all single pass aggs first compute_single_pass_aggs(keys, diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index ad807b57766..ee6bc0b9f4b 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -545,15 +545,15 @@ rmm::device_uvector hash_node_type_with_field_name(device_span{d_hasher}, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_fields, 40)}, // 40% occupancy + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::linear_probing<1, hasher_type>{d_hasher}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; key_set.insert_if_async(iter, iter + num_nodes, thrust::counting_iterator(0), // stencil @@ -734,14 +734,15 @@ std::pair, rmm::device_uvector> hash_n constexpr size_type empty_node_index_sentinel = -1; using hasher_type = decltype(d_hashed_cache); - auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_nodes)}, - cuco::empty_key{empty_node_index_sentinel}, - d_equal, - cuco::linear_probing<1, hasher_type>{d_hashed_cache}, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_nodes)}, + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::linear_probing<1, hasher_type>{d_hashed_cache}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; // insert and convert node ids to unique set ids auto nodes_itr = thrust::make_counting_iterator(0); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index c688c809e04..60bb2366e87 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -649,7 +649,7 @@ struct column_to_strings_fn { auto const list_child_string = make_lists_column( column.size(), std::move(new_offsets), - std::move(child_string_with_null()), + child_string_with_null(), column.null_count(), cudf::detail::copy_bitmask(column, stream_, rmm::mr::get_current_device_resource()), stream_); diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index d4ef2747c9d..789702ce538 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -432,13 +432,13 @@ std::unique_ptr> conditional_left_semi_join( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr)); + return detail::conditional_join_anti_semi(left, + right, + binary_predicate, + 
detail::join_kind::LEFT_SEMI_JOIN, + output_size, + cudf::get_default_stream(), + mr); } std::unique_ptr> conditional_left_anti_join( @@ -449,13 +449,13 @@ std::unique_ptr> conditional_left_anti_join( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr)); + return detail::conditional_join_anti_semi(left, + right, + binary_predicate, + detail::join_kind::LEFT_ANTI_JOIN, + output_size, + cudf::get_default_stream(), + mr); } std::size_t conditional_inner_join_size(table_view const& left, @@ -484,12 +484,12 @@ std::size_t conditional_left_semi_join_size(table_view const& left, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr)); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + detail::join_kind::LEFT_SEMI_JOIN, + cudf::get_default_stream(), + mr); } std::size_t conditional_left_anti_join_size(table_view const& left, @@ -498,12 +498,12 @@ std::size_t conditional_left_anti_join_size(table_view const& left, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr)); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + detail::join_kind::LEFT_ANTI_JOIN, + cudf::get_default_stream(), + mr); } } // namespace cudf diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index daa1bf17c0d..3d95b0c5a5c 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -119,7 +119,7 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, {}, cuco::thread_scope_device, cuco_storage_type{}, - cudf::detail::cuco_allocator{stream}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()} { CUDF_FUNC_RANGE(); diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index eb9b687630b..5d01482f44a 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -374,7 +374,7 @@ hash_join::hash_join(cudf::table_view const& build, cuco::empty_key{std::numeric_limits::max()}, cuco::empty_value{cudf::detail::JoinNoneValue}, stream.value(), - cudf::detail::cuco_allocator{stream}}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}, _build{build}, _preprocessed_build{ cudf::experimental::row::equality::preprocessed_table::create(_build, stream)} diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 4157100b67e..86402a0e7de 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -48,11 +48,13 @@ using mixed_multimap_type = cuco::static_multimap, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using semi_map_type = cuco::legacy:: - static_map; +using semi_map_type = cuco::legacy::static_map>; using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 90748e6f322..48b94c777de 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -126,11 +126,12 @@ mixed_join( auto build_view = table_device_view::create(build, stream); // 
Don't use multimap_type because we want a CG size of 1. - mixed_multimap_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - stream.value(), - cudf::detail::cuco_allocator{stream}}; + mixed_multimap_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + stream.value(), + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we @@ -391,11 +392,12 @@ compute_mixed_join_output_size(table_view const& left_equality, auto build_view = table_device_view::create(build, stream); // Don't use multimap_type because we want a CG size of 1. - mixed_multimap_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - stream.value(), - cudf::detail::cuco_allocator{stream}}; + mixed_multimap_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + stream.value(), + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index c147ea3c253..3e4188a0fbd 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -163,11 +163,12 @@ std::unique_ptr> mixed_join_semi( cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + semi_map_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index bebb9d14923..d49c0c6f0d2 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -164,11 +164,13 @@ compute_row_frequencies(table_view const& input, "Nested types are not yet supported in histogram aggregation.", std::invalid_argument); - auto map = cudf::detail::hash_map_type{compute_hash_table_size(input.num_rows()), - cuco::empty_key{-1}, - cuco::empty_value{std::numeric_limits::min()}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto map = cudf::detail::hash_map_type{ + compute_hash_table_size(input.num_rows()), + cuco::empty_key{-1}, + cuco::empty_value{std::numeric_limits::min()}, + + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(input, stream); diff --git 
a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 81227cb9a2d..66cefd0aa2f 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -229,14 +229,15 @@ rmm::device_uvector contains(table_view const& haystack, [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) { auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; - auto set = cuco::static_set{cuco::extent{compute_hash_table_size(haystack.num_rows())}, - cuco::empty_key{rhs_index_type{-1}}, - d_equal, - probing_scheme, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto set = cuco::static_set{ + cuco::extent{compute_hash_table_size(haystack.num_rows())}, + cuco::empty_key{rhs_index_type{-1}}, + d_equal, + probing_scheme, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index e2c5aba6802..6afd6e34c50 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -97,15 +97,16 @@ rmm::device_uvector distinct_indices(table_view const& input, auto const helper_func = [&](auto const& d_equal) { using RowHasher = std::decay_t; - auto set = hash_set_type{num_rows, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_equal, - {row_hash.device_hasher(has_nulls)}, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto set = hash_set_type{ + num_rows, + 0.5, // desired load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_equal, + {row_hash.device_hasher(has_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; return detail::reduce_by_row(set, num_rows, keep, stream, mr); }; diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 9843bb889f4..cdf9faddf31 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -141,14 +141,15 @@ cudf::size_type distinct_count(table_view const& keys, auto const comparator_helper = [&](auto const row_equal) { using hasher_type = decltype(hash_key); - auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_rows)}, - cuco::empty_key{-1}, - row_equal, - cuco::linear_probing<1, hasher_type>{hash_key}, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_rows)}, + cuco::empty_key{-1}, + row_equal, + cuco::linear_probing<1, hasher_type>{hash_key}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; auto const iter = thrust::counting_iterator(0); // when nulls are equal, we skip hashing any row that has a null diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index fca67c98873..bea02e3dbe8 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -57,7 +57,7 @@ using hash_set_type = cudf::experimental::row::hash::device_row_hasher< cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>>, - cudf::detail::cuco_allocator, + 
cudf::detail::cuco_allocator, cuco::storage<1>>; /** diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh index a2e441c3284..69c77224eb7 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cuh +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -106,7 +106,7 @@ using merge_pairs_map_type = cuco::static_map, cuco_storage>; /** @@ -164,7 +164,7 @@ using mp_table_map_type = cuco::static_map, cuco_storage>; } // namespace detail diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index f34c5c4f7f6..9fb86aecce3 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -43,16 +43,16 @@ namespace { std::unique_ptr initialize_merge_pairs_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - auto merge_pairs_map = - std::make_unique(static_cast(input.size()), - cuco::empty_key{-1}, - cuco::empty_value{-1}, - bpe_equal{input}, - bpe_probe_scheme{bpe_hasher{input}}, - cuco::thread_scope_device, - cuco_storage{}, - cudf::detail::cuco_allocator{stream}, - stream.value()); + auto merge_pairs_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + bpe_equal{input}, + bpe_probe_scheme{bpe_hasher{input}}, + cuco::thread_scope_device, + cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); auto iter = cudf::detail::make_counting_transform_iterator( 0, @@ -67,15 +67,16 @@ std::unique_ptr initialize_merge_pairs_map( std::unique_ptr initialize_mp_table_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - auto mp_table_map = std::make_unique(static_cast(input.size()), - cuco::empty_key{-1}, - cuco::empty_value{-1}, - mp_equal{input}, - mp_probe_scheme{mp_hasher{input}}, - cuco::thread_scope_device, - cuco_storage{}, - cudf::detail::cuco_allocator{stream}, - stream.value()); + auto mp_table_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + mp_equal{input}, + mp_probe_scheme{mp_hasher{input}}, + cuco::thread_scope_device, + cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); auto iter = cudf::detail::make_counting_transform_iterator( 0, diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 97abb1487d8..5945921ed9d 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -100,7 +100,7 @@ using vocabulary_map_type = cuco::static_map, cuco_storage>; } // namespace } // namespace detail @@ -152,7 +152,7 @@ tokenize_vocabulary::tokenize_vocabulary(cudf::strings_column_view const& input, detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}}, cuco::thread_scope_device, detail::cuco_storage{}, - cudf::detail::cuco_allocator{stream}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()); // the row index is the token id (value for each key in the map) diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index 284b6c4c50c..07ce672b14d 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -43,7 +43,7 @@ TYPED_TEST(GatherTest, IdentityTest) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); for (auto i = 0; i < source_table.num_columns(); 
++i) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i)); @@ -66,7 +66,7 @@ TYPED_TEST(GatherTest, ReverseIdentityTest) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); cudf::test::fixed_width_column_wrapper expect_column(reversed_data, reversed_data + source_size); @@ -94,7 +94,7 @@ TYPED_TEST(GatherTest, EveryOtherNullOdds) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); auto expect_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); auto expect_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); @@ -126,7 +126,7 @@ TYPED_TEST(GatherTest, EveryOtherNullEvens) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); auto expect_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2 + 1; }); @@ -160,7 +160,7 @@ TYPED_TEST(GatherTest, AllNull) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); // Check that the result is also all invalid CUDF_TEST_EXPECT_TABLES_EQUAL(source_table, result->view()); @@ -190,7 +190,7 @@ TYPED_TEST(GatherTest, MultiColReverseIdentityTest) cudf::table_view source_table{source_columns}; - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); cudf::test::fixed_width_column_wrapper expect_column(reversed_data, reversed_data + source_size); @@ -228,7 +228,7 @@ TYPED_TEST(GatherTest, MultiColNulls) cudf::table_view source_table{source_columns}; - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); // Expected data auto expect_data = diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index cd280302677..b3d9b2e2f5f 100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ b/cpp/tests/reshape/byte_cast_tests.cpp @@ -61,8 +61,8 @@ TEST_F(ByteCastTest, int16ValuesWithNulls) auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5); auto int16_expected = cudf::make_lists_column( 5, - std::move(cudf::test::fixed_width_column_wrapper{0, 0, 2, 2, 4, 4}.release()), - std::move(int16_data.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 2, 2, 4, 4}.release(), + int16_data.release(), null_count, std::move(null_mask)); @@ -109,8 +109,8 @@ TEST_F(ByteCastTest, int32ValuesWithNulls) auto int32_expected = cudf::make_lists_column( 5, - std::move(cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release()), - std::move(int32_data.release()), + cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release(), + int32_data.release(), null_count, std::move(null_mask)); @@ -163,9 +163,8 @@ TEST_F(ByteCastTest, int64ValuesWithNulls) auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5); auto int64_expected = cudf::make_lists_column( 5, - std::move( - cudf::test::fixed_width_column_wrapper{0, 0, 
8, 8, 16, 16}.release()), - std::move(int64_data.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release(), + int64_data.release(), null_count, std::move(null_mask)); @@ -226,8 +225,8 @@ TEST_F(ByteCastTest, fp32ValuesWithNulls) cudf::test::detail::make_null_mask(even_validity, even_validity + 5); auto fp32_expected = cudf::make_lists_column( 5, - std::move(cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release()), - std::move(fp32_data.release()), + cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release(), + fp32_data.release(), null_count, std::move(null_mask)); @@ -297,9 +296,8 @@ TEST_F(ByteCastTest, fp64ValuesWithNulls) auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5); auto fp64_expected = cudf::make_lists_column( 5, - std::move( - cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release()), - std::move(fp64_data.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release(), + fp64_data.release(), null_count, std::move(null_mask)); diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index df005dfa1dc..f0010fc1ed9 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -448,12 +448,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); auto [null_mask, null_count] = detail::make_null_mask(list_of_struct_of_list_validity, list_of_struct_of_list_validity + 5); - auto list_of_struct_of_list = cudf::make_lists_column( - 5, - std::move(fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release()), - std::move(struct_of_lists_col), - null_count, - std::move(null_mask)); + auto list_of_struct_of_list = + cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), + std::move(struct_of_lists_col), + null_count, + std::move(null_mask)); // Compare with expected values. 
@@ -468,12 +468,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList)
   std::tie(null_mask, null_count) =
     detail::make_null_mask(list_of_struct_of_list_validity, list_of_struct_of_list_validity + 5);
 
-  auto expected_level3_list = cudf::make_lists_column(
-    5,
-    std::move(fixed_width_column_wrapper{0, 0, 2, 4, 4, 6}.release()),
-    std::move(expected_level2_struct),
-    null_count,
-    std::move(null_mask));
+  auto expected_level3_list =
+    cudf::make_lists_column(5,
+                            fixed_width_column_wrapper{0, 0, 2, 4, 4, 6}.release(),
+                            std::move(expected_level2_struct),
+                            null_count,
+                            std::move(null_mask));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*list_of_struct_of_list, *expected_level3_list);
 }
@@ -498,12 +498,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, StructOfListOfStruct)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; });
   auto [null_mask, null_count] = detail::make_null_mask(list_validity, list_validity + 5);
 
-  auto lists_col = cudf::make_lists_column(
-    5,
-    std::move(fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release()),
-    std::move(structs_col),
-    null_count,
-    std::move(null_mask));
+  auto lists_col =
+    cudf::make_lists_column(5,
+                            fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(),
+                            std::move(structs_col),
+                            null_count,
+                            std::move(null_mask));
 
   std::vector> cols;
   cols.push_back(std::move(lists_col));
@@ -519,12 +519,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, StructOfListOfStruct)
   std::tie(null_mask, null_count) = detail::make_null_mask(list_validity, list_validity + 5);
 
-  auto expected_lists_col = cudf::make_lists_column(
-    5,
-    std::move(fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release()),
-    std::move(expected_structs_col),
-    null_count,
-    std::move(null_mask));
+  auto expected_lists_col =
+    cudf::make_lists_column(5,
+                            fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(),
+                            std::move(expected_structs_col),
+                            null_count,
+                            std::move(null_mask));
 
   // Test that the lists child column is as expected.
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_lists_col, struct_of_list_of_struct->child(0));

From e8156d42163fb02aa90baba9be20ab89bc9ebef1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 6 Aug 2024 17:03:10 -0400
Subject: [PATCH 030/270] Fix segmented-sort overlapped input/output indices
 (#16463)

Fixes a call to CUB `DeviceSegmentedSort::SortPairs` where the input and
output indices pointed to the same temp memory. The documentation at
https://nvidia.github.io/cccl/cub/api/structcub_1_1DeviceSegmentedSort.html#id8
indicates that the `d_values_in` and `d_values_out` memory must not overlap,
so using the same pointer for both produced invalid output under certain
conditions. The internal function had been written to expect the input values
to be updated in place; the fix uses separate device memory for the input and
output indices.
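For reference, a minimal sketch of the two-phase CUB calling pattern with
non-overlapping value buffers (illustrative only, not code from this patch;
the function and buffer names are hypothetical placeholders, and int keys and
values are assumed for simplicity):

    #include <cstddef>
    #include <cuda_runtime.h>
    #include <cub/device/device_segmented_sort.cuh>

    // Sort int values by int keys within segments; d_values_in and
    // d_values_out must reference distinct device allocations.
    void segmented_sort_pairs(int const* d_keys_in, int* d_keys_out,
                              int const* d_values_in, int* d_values_out,
                              int num_items, int num_segments,
                              int const* d_offsets, cudaStream_t stream)
    {
      void* d_temp_storage           = nullptr;
      std::size_t temp_storage_bytes = 0;
      // First call only computes the required temporary storage size.
      cub::DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                          d_keys_in, d_keys_out,
                                          d_values_in, d_values_out,
                                          num_items, num_segments,
                                          d_offsets, d_offsets + 1, stream);
      cudaMalloc(&d_temp_storage, temp_storage_bytes);
      // Second call performs the segmented sort.
      cub::DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                          d_keys_in, d_keys_out,
                                          d_values_in, d_values_out,
                                          num_items, num_segments,
                                          d_offsets, d_offsets + 1, stream);
      cudaFree(d_temp_storage);
    }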
Closes #16455 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16463 --- cpp/src/sort/segmented_sort_impl.cuh | 4 +++- cpp/tests/sort/segmented_sort_tests.cpp | 26 ++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh index 6d472925b30..281fdfa6b8f 100644 --- a/cpp/src/sort/segmented_sort_impl.cuh +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -79,6 +79,8 @@ struct column_fast_sort_fn { stream, rmm::mr::get_current_device_resource()); mutable_column_view output_view = temp_col->mutable_view(); + auto temp_indices = cudf::column( + cudf::column_view(indices.type(), indices.size(), indices.head(), nullptr, 0), stream); // DeviceSegmentedSort is faster than DeviceSegmentedRadixSort at this time auto fast_sort_impl = [stream](bool ascending, [[maybe_unused]] auto&&... args) { @@ -118,7 +120,7 @@ struct column_fast_sort_fn { fast_sort_impl(ascending, input.begin(), output_view.begin(), - indices.begin(), + temp_indices.view().begin(), indices.begin(), input.size(), segment_offsets.size() - 1, diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index da9666cbc74..f4fe2c5956a 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,9 @@ #include #include +#include #include +#include #include #include @@ -338,3 +340,25 @@ TEST_F(SegmentedSortInt, Bool) result = cudf::stable_segmented_sorted_order(cudf::table_view({test_col}), segments); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } + +// Specific test for fix in https://github.com/rapidsai/cudf/pull/16463 +TEST_F(SegmentedSortInt, UnbalancedOffsets) +{ + auto h_input = std::vector(3535); + std::iota(h_input.begin(), h_input.end(), 1); + std::sort(h_input.begin(), h_input.end(), std::greater{}); + std::fill_n(h_input.begin(), 4, 0); + std::fill(h_input.begin() + 3533, h_input.end(), 10000); + auto d_input = cudf::detail::make_device_uvector_sync( + h_input, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto input = cudf::column_view(cudf::device_span(d_input)); + auto segments = cudf::test::fixed_width_column_wrapper({0, 4, 3533, 3535}); + // full sort should match handcrafted input data here + auto expected = cudf::sort(cudf::table_view({input})); + + auto input_view = cudf::table_view({input}); + auto result = cudf::segmented_sort_by_key(input_view, input_view, segments); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected->view().column(0)); + result = cudf::stable_segmented_sort_by_key(input_view, input_view, segments); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected->view().column(0)); +} From 6b0bff4b096ea87cd3436dba86146ed75af0f81e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 6 Aug 2024 14:48:16 -1000 Subject: [PATCH 031/270] Disallow cudf.Series to accept column in favor of `._from_column` (#16454) `cudf.Series` is a public constructor that happens to accept a private `ColumnBase` object. 
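As an illustrative sketch of the pattern this change targets (not code from
the patch itself; the example column is hypothetical):

    import cudf
    from cudf.core.column import as_column

    col = as_column([1, 2, 3])  # a private ColumnBase, as internal ops produce
    # Previously the public constructor accepted the private object directly:
    #   s = cudf.Series(col)
    # After this change, internal code wraps a column explicitly instead:
    s = cudf.Series._from_column(col, name="x")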
Many ops return Columns, and it is natural to want to reconstruct a `Series`,
as the sketch above shows. This PR adds a `SingleColumnFrame._from_column`
classmethod for instances where we need to wrap a new column in an `Index` or
`Series`. This constructor also bypasses some unneeded validation in
`ColumnAccessor` and `Series`.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16454
---
 python/cudf/cudf/core/byte_pair_encoding.py  |   6 +-
 python/cudf/cudf/core/column/categorical.py  |  16 +--
 python/cudf/cudf/core/column/methods.py      |  15 ++-
 python/cudf/cudf/core/column/numerical.py    |  12 +-
 python/cudf/cudf/core/column/string.py       |  23 ++--
 python/cudf/cudf/core/dataframe.py           | 116 +++++++++----------
 python/cudf/cudf/core/groupby/groupby.py     |  13 +--
 python/cudf/cudf/core/index.py               |  44 ++++++-
 python/cudf/cudf/core/indexed_frame.py       |  18 +--
 python/cudf/cudf/core/multiindex.py          |  19 ++-
 python/cudf/cudf/core/reshape.py             |   8 +-
 python/cudf/cudf/core/series.py              | 101 ++++++++++++----
 python/cudf/cudf/core/single_column_frame.py |  41 +++----
 python/cudf/cudf/core/tokenize_vocabulary.py |   8 +-
 python/cudf/cudf/core/tools/datetimes.py     |  11 +-
 python/cudf/cudf/core/tools/numeric.py       |  29 ++---
 python/cudf/cudf/datasets.py                 |   5 +-
 python/cudf/cudf/io/dlpack.py                |   2 +-
 python/cudf/cudf/tests/test_apply_rows.py    |   8 +-
 python/cudf/cudf/tests/test_column.py        |  44 ++++---
 python/cudf/cudf/tests/test_dataframe.py     |  26 +++--
 python/cudf/cudf/tests/test_decimal.py       |  10 +-
 python/cudf/cudf/tests/test_df_protocol.py   |   6 +-
 python/cudf/cudf/tests/test_list.py          |   2 +-
 python/cudf/cudf/tests/test_pickling.py      |   4 +-
 python/cudf/cudf/tests/test_replace.py       |   6 +-
 python/cudf/cudf/tests/test_series.py        |  10 +-
 python/cudf/cudf/tests/test_setitem.py       |  10 +-
 python/cudf/cudf/tests/test_string.py        |   2 +-
 python/cudf/cudf/tests/test_string_udfs.py   |   4 +-
 python/dask_cudf/dask_cudf/backends.py       |   7 +-
 python/dask_cudf/dask_cudf/core.py           |   2 +-
 32 files changed, 360 insertions(+), 268 deletions(-)

diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index 4c881022ecf..6ca64a0a2be 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -27,7 +27,7 @@ class BytePairEncoder:
     def __init__(self, merges_pair: "cudf.Series"):
         self.merge_pairs = cpp_merge_pairs(merges_pair._column)
 
-    def __call__(self, text, separator: str = " "):
+    def __call__(self, text, separator: str = " ") -> cudf.Series:
         """
 
         Parameters
@@ -56,4 +56,4 @@ def __call__(self, text, separator: str = " "):
         sep = cudf.Scalar(separator, dtype="str")
         result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
 
-        return cudf.Series(result)
+        return cudf.Series._from_column(result)
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 55bfae30470..6fa69eb9cc1 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -123,7 +123,7 @@ def categories(self) -> "cudf.core.index.Index":
         return self._column.dtype.categories
 
     @property
-    def codes(self) -> "cudf.Series":
+    def codes(self) -> cudf.Series:
         """
         Return Series of codes as well as the index.
""" @@ -132,7 +132,7 @@ def codes(self) -> "cudf.Series": if isinstance(self._parent, cudf.Series) else None ) - return cudf.Series(self._column.codes, index=index) + return cudf.Series._from_column(self._column.codes, index=index) @property def ordered(self) -> bool: @@ -918,7 +918,7 @@ def find_and_replace( ) cur_categories = replaced.categories new_categories = cur_categories.apply_boolean_mask( - ~cudf.Series(cur_categories.isin(drop_values)) + cur_categories.isin(drop_values).unary_operator("not") ) replaced = replaced._set_categories(new_categories) df = df.dropna(subset=["new"]) @@ -943,7 +943,7 @@ def find_and_replace( # If a category is being replaced by an existing one, we # want to map it to None. If it's totally new, we want to # map it to the new label it is to be replaced by - dtype_replace = cudf.Series._from_data({None: replacement_col}) + dtype_replace = cudf.Series._from_column(replacement_col) dtype_replace[dtype_replace.isin(cats_col)] = None new_cats_col = cats_col.find_and_replace( to_replace_col, dtype_replace._column @@ -1273,12 +1273,8 @@ def _categories_equal( return False # if order doesn't matter, sort before the equals call below if not ordered: - cur_categories = cudf.Series(cur_categories).sort_values( - ignore_index=True - ) - new_categories = cudf.Series(new_categories).sort_values( - ignore_index=True - ) + cur_categories = cur_categories.sort_values() + new_categories = new_categories.sort_values() return cur_categories.equals(new_categories) def _set_categories( diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 7c6f4e05577..8c46d238057 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -7,6 +7,8 @@ from typing_extensions import Literal import cudf +import cudf.core.column +import cudf.core.column_accessor from cudf.utils.utils import NotIterable ParentType = Union["cudf.Series", "cudf.core.index.Index"] @@ -84,14 +86,11 @@ def _return_or_inplace( data=table, index=self._parent.index ) elif isinstance(self._parent, cudf.Series): - if retain_index: - return cudf.Series( - new_col, - name=self._parent.name, - index=self._parent.index, - ) - else: - return cudf.Series(new_col, name=self._parent.name) + return cudf.Series._from_column( + new_col, + name=self._parent.name, + index=self._parent.index if retain_index else None, + ) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(new_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c326a10c844..df27134d458 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -555,11 +555,8 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if self.dtype.kind == "f": # Exclude 'np.inf', '-np.inf' - s = cudf.Series(self) - # TODO: replace np.inf with cudf scalar when - # https://github.com/rapidsai/cudf/pull/6297 merges - non_infs = s[~((s == np.inf) | (s == -np.inf))] - col = non_infs._column + not_inf = (self != np.inf) & (self != -np.inf) + col = self.apply_boolean_mask(not_inf) else: col = self @@ -599,8 +596,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: else: filled = self.fillna(0) return ( - cudf.Series(filled).astype(to_dtype).astype(filled.dtype) - == cudf.Series(filled) + filled.astype(to_dtype).astype(filled.dtype) == filled ).all() # want to cast float to int: @@ -615,7 +611,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # 
NOTE(seberg): it would make sense to limit to the mantissa range. if (float(self.min()) >= min_) and (float(self.max()) <= max_): filled = self.fillna(0) - return (cudf.Series(filled) % 1 == 0).all() + return (filled % 1 == 0).all() else: return False diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b422ff86b17..1a4b558749d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -358,7 +358,7 @@ def cat(self, others=None, sep=None, na_rep=None): ) if len(data) == 1 and data.null_count == 1: - data = [""] + data = cudf.core.column.as_column("", length=len(data)) # We only want to keep the index if we are adding something to each # row, not if we are joining all the rows into a single string. out = self._return_or_inplace(data, retain_index=others is not None) @@ -3623,7 +3623,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) - def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series": + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. @@ -3679,12 +3679,12 @@ def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series": f"got: {patterns_column.dtype}" ) - return cudf.Series( + return cudf.Series._from_column( libstrings.find_multiple(self._column, patterns_column), + name=self._parent.name, index=self._parent.index if isinstance(self._parent, cudf.Series) else self._parent, - name=self._parent.name, ) def isempty(self) -> SeriesOrIndex: @@ -4376,14 +4376,9 @@ def code_points(self) -> SeriesOrIndex: 2 99 dtype: int32 """ - - new_col = libstrings.code_points(self._column) - if isinstance(self._parent, cudf.Series): - return cudf.Series(new_col, name=self._parent.name) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index(new_col, name=self._parent.name) - else: - return new_col + return self._return_or_inplace( + libstrings.code_points(self._column), retain_index=False + ) def translate(self, table: dict) -> SeriesOrIndex: """ @@ -4694,7 +4689,9 @@ def character_tokenize(self) -> SeriesOrIndex: if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series(result_col, name=self._parent.name, index=index) + return cudf.Series._from_column( + result_col, name=self._parent.name, index=index + ) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(result_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 865d2706ca3..a53c7bcc63c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -382,7 +382,10 @@ def _setitem_tuple_arg(self, key, value): length = len(idx) if idx is not None else 1 value = as_column(value, length=length) - new_col = cudf.Series(value, index=idx) + if isinstance(value, ColumnBase): + new_col = cudf.Series._from_column(value, index=idx) + else: + new_col = cudf.Series(value, index=idx) if len(self._frame.index) != 0: new_col = new_col._align_to_index( self._frame.index, how="right" @@ -500,28 +503,33 @@ def __getitem__(self, arg): return frame._slice(row_spec.key) elif isinstance(row_spec, indexing_utils.ScalarIndexer): result = frame._gather(row_spec.key, keep_index=True) + new_name = result.index[0] + new_index = ensure_index(result.keys()) # Attempt to turn into series. 
- try: - # Behaviour difference from pandas, which will merrily - # turn any heterogeneous set of columns into a series if - # you only ask for one row. - new_name = result.index[0] - result = Series._concat( - [result[name] for name in column_names], - index=result.keys(), - ) - result.name = new_name - return result - except TypeError: - # Couldn't find a common type, Hence: - # Raise in pandas compatibility mode, - # or just return a 1xN dataframe otherwise - if cudf.get_option("mode.pandas_compatible"): - raise TypeError( - "All columns need to be of same type, please " - "typecast to common dtype." + if len(column_names) == 0: + return Series([], index=new_index, name=new_name) + else: + try: + # Behaviour difference from pandas, which will merrily + # turn any heterogeneous set of columns into a series if + # you only ask for one row. + ser = Series._concat( + [result[name] for name in column_names], ) - return result + except TypeError as err: + # Couldn't find a common type, Hence: + # Raise in pandas compatibility mode, + # or just return a 1xN dataframe otherwise + if cudf.get_option("mode.pandas_compatible"): + raise TypeError( + "All columns need to be of same type, please " + "typecast to common dtype." + ) from err + return result + else: + ser.index = new_index + ser.name = new_name + return ser elif isinstance(row_spec, indexing_utils.EmptyIndexer): return frame._empty_like(keep_index=True) assert_never(row_spec) @@ -1488,14 +1496,14 @@ def __delitem__(self, name): self._drop_column(name) @_performance_tracking - def memory_usage(self, index=True, deep=False): + def memory_usage(self, index=True, deep=False) -> cudf.Series: mem_usage = [col.memory_usage for col in self._data.columns] names = [str(name) for name in self._data.names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") - return Series._from_data( - data={None: as_column(mem_usage)}, + return Series._from_column( + as_column(mem_usage), index=cudf.Index(names), ) @@ -1752,7 +1760,7 @@ def _concat( if 1 == first_data_column_position: table_index = cudf.Index(cols[0]) elif first_data_column_position > 1: - table_index = DataFrame._from_data( + table_index = cudf.MultiIndex._from_data( data=dict( zip( indices[:first_data_column_position], @@ -3803,7 +3811,9 @@ def agg(self, aggs, axis=None): col_empty = column_empty( len(idxs), dtype=col.dtype, masked=True ) - ans = cudf.Series(data=col_empty, index=idxs) + ans = cudf.Series._from_column( + col_empty, index=cudf.Index(idxs) + ) if isinstance(aggs.get(key), abc.Iterable): # TODO : Allow simultaneous pass for multi-aggregation # as a future optimization @@ -4801,7 +4811,7 @@ def _func(x): # pragma: no cover # this could be written as a single kernel result = {} for name, col in self._data.items(): - apply_sr = Series._from_data({None: col}) + apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column return DataFrame._from_data(result, index=self.index) @@ -6083,8 +6093,8 @@ def quantile( if q_is_number: result = result.transpose() - return Series( - data=result._columns[0], index=result.index, name=q + return Series._from_column( + result._columns[0], name=q, index=result.index ) else: # Ensure that qs is non-scalar so that we always get a column back. 
@@ -6346,13 +6356,9 @@ def count(self, axis=0, numeric_only=False): if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") length = len(self) - return Series._from_data( - { - None: as_column( - [length - col.null_count for col in self._columns] - ) - }, - cudf.Index(self._data.names), + return Series._from_column( + as_column([length - col.null_count for col in self._columns]), + index=cudf.Index(self._data.names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6480,7 +6486,7 @@ def _reduce( ) else: idx = cudf.Index(source._data.names) - return Series._from_data({None: as_column(result)}, idx) + return Series._from_column(as_column(result), index=idx) elif axis == 1: return source._apply_cupy_method_axis_1(op, **kwargs) else: @@ -6710,11 +6716,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series( - result, - index=self.index, - dtype=result_dtype, - ) + return Series._from_column(result, index=self.index) else: result_df = DataFrame(result).set_index(self.index) result_df._set_columns_like(prepared._data) @@ -7302,9 +7304,7 @@ def unnamed_group_generator(): # Construct the resulting dataframe / series if not has_unnamed_levels: - result = Series._from_data( - data={None: stacked[0]}, index=new_index - ) + result = Series._from_column(stacked[0], index=new_index) else: if unnamed_level_values.nlevels == 1: unnamed_level_values = unnamed_level_values.get_level_values(0) @@ -7445,10 +7445,8 @@ def to_struct(self, name=None): size=len(self), offset=0, ) - return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor( - {name: col}, verify=False - ), + return cudf.Series._from_column( + col, index=self.index, name=name, ) @@ -7935,12 +7933,10 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise ValueError( "Cannot operate inplace if there is no assignment" ) - return Series._from_data( - { - None: libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - } + return Series._from_column( + libcudf.transform.compute_column( + [*self._columns], self._column_names, statements[0] + ) ) targets = [] @@ -8484,7 +8480,9 @@ def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): return non_null_columns, dtypes -def _find_common_dtypes_and_categories(non_null_columns, dtypes): +def _find_common_dtypes_and_categories( + non_null_columns, dtypes +) -> dict[Any, ColumnBase]: # A mapping of {idx: categories}, where `categories` is a # column of all the unique categorical values from each # categorical column across all input frames @@ -8500,9 +8498,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): isinstance(col, cudf.core.column.CategoricalColumn) for col in cols ): # Combine and de-dupe the categories - categories[idx] = cudf.Series( - concat_columns([col.categories for col in cols]) - )._column.unique() + categories[idx] = concat_columns( + [col.categories for col in cols] + ).unique() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_signed_type(len(categories[idx])) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3cfbd1d736a..92c4b73ceaa 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -458,12 +458,11 @@ def size(self): """ Return the size of each group. 
""" + col = cudf.core.column.column_empty( + len(self.obj), "int8", masked=False + ) return ( - cudf.Series( - cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) - ) + cudf.Series._from_column(col) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) @@ -484,7 +483,7 @@ def cumcount(self, ascending: bool = True): "ascending is currently not implemented." ) return ( - cudf.Series( + cudf.Series._from_column( cudf.core.column.column_empty( len(self.obj), "int8", masked=False ), @@ -1069,7 +1068,7 @@ def ngroup(self, ascending=True): # Count descending from num_groups - 1 to 0 groups = range(num_groups - 1, -1, -1) - group_ids = cudf.Series._from_data({None: as_column(groups)}) + group_ids = cudf.Series._from_column(as_column(groups)) if has_null_group: group_ids.iloc[-1] = cudf.NA diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0d29ef07e7d..094da09ab08 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -60,7 +60,7 @@ from cudf.utils.utils import _warn_no_dask_cudf, search_range if TYPE_CHECKING: - from collections.abc import Generator, Iterable + from collections.abc import Generator, Hashable, Iterable from datetime import tzinfo @@ -1071,6 +1071,16 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None + ) -> Self: + ca = cudf.core.column_accessor.ColumnAccessor( + {name: column}, verify=False + ) + return _index_from_data(ca) + @classmethod @_performance_tracking def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: @@ -1092,8 +1102,30 @@ def _from_data_like_self( @classmethod @_performance_tracking def from_arrow(cls, obj): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + array : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. + + Returns + ------- + SingleColumnFrame + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) + Index(['a', 'b', ], dtype='object') + """ try: - return cls(ColumnBase.from_arrow(obj)) + return cls._from_column(ColumnBase.from_arrow(obj)) except TypeError: # Try interpreting object as a MultiIndex before failing. 
return cudf.MultiIndex.from_arrow(obj) @@ -1297,22 +1329,22 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result.values) scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") - (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) - result_series = cudf.Series(result) + result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( index=self, positions=result_series, method=method, - target_col=cudf.Series(needle), + target_col=cudf.Series._from_column(needle), tolerance=tolerance, ) elif method == "nearest": result_series = _get_nearest_indexer( index=self, positions=result_series, - target_col=cudf.Series(needle), + target_col=cudf.Series._from_column(needle), tolerance=tolerance, ) elif method is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0678ebfdd81..24d947a574a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -35,6 +35,7 @@ is_list_like, is_scalar, ) +from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column @@ -67,7 +68,6 @@ Dtype, NotImplementedType, ) - from cudf.core._base_index import BaseIndex doc_reset_index_template = """ @@ -304,6 +304,10 @@ def _from_data( index: BaseIndex | None = None, ): out = super()._from_data(data) + if not (index is None or isinstance(index, BaseIndex)): + raise ValueError( + f"index must be None or a cudf.Index not {type(index).__name__}" + ) out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -2934,8 +2938,8 @@ def hash_values(self, method="murmur3", seed=None): # Note that both Series and DataFrame return Series objects from this # calculation, necessitating the unfortunate circular reference to the # child class here. 
- return cudf.Series._from_data( - {None: libcudf.hash.hash([*self._columns], method, seed)}, + return cudf.Series._from_column( + libcudf.hash.hash([*self._columns], method, seed), index=self.index, ) @@ -3219,13 +3223,13 @@ def duplicated(self, subset=None, keep="first"): distinct = libcudf.stream_compaction.distinct_indices( columns, keep=keep ) - (result,) = libcudf.copying.scatter( + result = libcudf.copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, - ) - return cudf.Series(result, index=self.index) + )[0] + return cudf.Series._from_column(result, index=self.index) @_performance_tracking def _empty_like(self, keep_index=True) -> Self: @@ -3506,7 +3510,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_data({None: col}, self.index) + result = cudf.Series._from_column(col, index=self.index) return result diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 2788455aebf..9646b34830f 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -702,12 +702,8 @@ def _compute_validity_mask(self, index, row_tuple, max_length): data_table = cudf.concat( [ frame, - cudf.DataFrame( - { - "idx": cudf.Series( - column.as_column(range(len(frame))) - ) - } + cudf.DataFrame._from_data( + {"idx": column.as_column(range(len(frame)))} ), ], axis=1, @@ -786,7 +782,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_data({None: index._data.columns[k]}), + cudf.Series._from_column(index._data.columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -852,7 +848,10 @@ def _get_row_major( valid_indices = self._get_valid_indices_by_tuple( df.index, row_tuple, len(df.index) ) - indices = cudf.Series(valid_indices) + if isinstance(valid_indices, column.ColumnBase): + indices = cudf.Series._from_column(valid_indices) + else: + indices = cudf.Series(valid_indices) result = df.take(indices) final = self._index_and_downcast(result, result.index, row_tuple) return final @@ -1925,8 +1924,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): *join_keys, how="inner", ) - (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) - result_series = cudf.Series(result) + result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index e7248977b1d..52a55760d4a 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -484,9 +484,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): if len(new_objs) == 1 and not ignore_index: return new_objs[0] else: - return cudf.Series._concat( - objs, axis=axis, index=None if ignore_index else True - ) + return cudf.Series._concat(objs, axis=axis, index=not ignore_index) elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): @@ -632,7 +630,7 @@ def melt( def _tile(A, reps): series_list = [A] * reps if reps > 0: - return cudf.Series._concat(objs=series_list, index=None) + return cudf.Series._concat(objs=series_list, index=False) else: return 
cudf.Series([], dtype=A.dtype) @@ -661,7 +659,7 @@ def _tile(A, reps): # Step 3: add values mdata[value_name] = cudf.Series._concat( - objs=[frame[val] for val in value_vars], index=None + objs=[frame[val] for val in value_vars], index=False ) return cudf.DataFrame(mdata) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 929af5cd981..de57ac5f290 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -69,6 +69,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + import pyarrow as pa + from cudf._typing import ( ColumnLike, DataFrameOrSeries, @@ -294,8 +296,8 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: return result try: arg = self._loc_to_iloc(arg) - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError(arg) + except (TypeError, KeyError, IndexError, ValueError) as err: + raise KeyError(arg) from err return self._frame.iloc[arg] @@ -394,8 +396,10 @@ def _loc_to_iloc(self, arg): return _indices_from_labels(self._frame, arg) else: - arg = cudf.core.series.Series(cudf.core.column.as_column(arg)) - if arg.dtype in (bool, np.bool_): + arg = cudf.core.series.Series._from_column( + cudf.core.column.as_column(arg) + ) + if arg.dtype.kind == "b": return arg else: indices = _indices_from_labels(self._frame, arg) @@ -510,7 +514,37 @@ def from_categorical(cls, categorical, codes=None): col = cudf.core.column.categorical.pandas_categorical_as_column( categorical, codes=codes ) - return Series(data=col) + return Series._from_column(col) + + @classmethod + @_performance_tracking + def from_arrow(cls, array: pa.Array): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + array : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. 
+ + Returns + ------- + SingleColumnFrame + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) + 0 a + 1 b + 2 + dtype: object + """ + return cls._from_column(ColumnBase.from_arrow(array)) @classmethod @_performance_tracking @@ -560,7 +594,8 @@ def from_masked_array(cls, data, mask, null_count=None): dtype: int64 """ col = as_column(data).set_mask(mask) - return cls(data=col) + ca = ColumnAccessor({None: col}, verify=False) + return cls._from_data(ca) @_performance_tracking def __init__( @@ -586,10 +621,10 @@ def __init__( column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): index_from_data = ensure_index(data.index) - elif isinstance(data, ColumnAccessor): + elif isinstance(data, (ColumnAccessor, ColumnBase)): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor" + "ColumnAccessor or a ColumnBase" ) elif isinstance(data, dict): if not data: @@ -656,6 +691,18 @@ def __init__( self._index = second_index self._check_data_index_length_match() + @classmethod + @_performance_tracking + def _from_column( + cls, + column: ColumnBase, + *, + name: abc.Hashable = None, + index: BaseIndex | None = None, + ) -> Self: + ca = ColumnAccessor({name: column}, verify=False) + return cls._from_data(ca, index=index) + @classmethod @_performance_tracking def _from_data( @@ -1535,17 +1582,21 @@ def dtype(self): @classmethod @_performance_tracking - def _concat(cls, objs, axis=0, index=True): + def _concat(cls, objs, axis=0, index: bool = True): # Concatenate index if not provided if index is True: if isinstance(objs[0].index, cudf.MultiIndex): - index = cudf.MultiIndex._concat([o.index for o in objs]) + result_index = cudf.MultiIndex._concat([o.index for o in objs]) else: with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) - index = cudf.core.index.Index._concat( + result_index = cudf.core.index.Index._concat( [o.index for o in objs] ) + elif index is False: + result_index = None + else: + raise ValueError(f"{index=} must be a bool") names = {obj.name for obj in objs} if len(names) == 1: @@ -1597,7 +1648,9 @@ def _concat(cls, objs, axis=0, index=True): if len(objs): col = col._with_type_metadata(objs[0].dtype) - return cls(data=col, index=index, name=name) + return cls._from_data( + ColumnAccessor({name: col}, verify=False), index=result_index + ) @property # type: ignore @_performance_tracking @@ -2709,8 +2762,8 @@ def mode(self, dropna=True): if len(val_counts) > 0: val_counts = val_counts[val_counts == val_counts.iloc[0]] - return Series._from_data( - {self.name: val_counts.index.sort_values()._column}, name=self.name + return Series._from_column( + val_counts.index.sort_values()._column, name=self.name ) @_performance_tracking @@ -2999,8 +3052,8 @@ def isin(self, values): f"to isin(), you passed a [{type(values).__name__}]" ) - return Series._from_data( - {self.name: self._column.isin(values)}, index=self.index + return Series._from_column( + self._column.isin(values), name=self.name, index=self.index ) @_performance_tracking @@ -3036,7 +3089,7 @@ def unique(self): res = self._column.unique() if cudf.get_option("mode.pandas_compatible"): return res.values - return Series(res, name=self.name) + return Series._from_column(res, name=self.name) @_performance_tracking def value_counts( @@ -3268,8 +3321,9 @@ def quantile( if return_scalar: return result - return Series._from_data( - data={self.name: result}, + return 
Series._from_column( + result, + name=self.name, index=cudf.Index(np_array_q) if quant_index else None, ) @@ -3351,8 +3405,9 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series( - cudf.core.column.numerical.digitize(self._column, bins, right) + return Series._from_column( + cudf.core.column.numerical.digitize(self._column, bins, right), + name=self.name, ) @_performance_tracking @@ -5293,10 +5348,10 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): elif b_col.null_count: null_values = b_col.isnull() else: - return Series(result_col, index=index) + return Series._from_column(result_col, index=index) result_col[null_values] = False if equal_nan is True and a_col.null_count and b_col.null_count: result_col[equal_nulls] = True - return Series(result_col, index=index) + return Series._from_column(result_col, index=index) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index a5ff1223791..eb6714029cf 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -15,11 +15,14 @@ is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable if TYPE_CHECKING: + from collections.abc import Hashable + import cupy import numpy import pyarrow as pa @@ -112,35 +115,17 @@ def values_host(self) -> numpy.ndarray: # noqa: D102 @classmethod @_performance_tracking - def from_arrow(cls, array) -> Self: - """Create from PyArrow Array/ChunkedArray. - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted. - - Raises - ------ - TypeError for invalid input type. - - Returns - ------- - SingleColumnFrame + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None + ) -> Self: + """Constructor for a single Column.""" + ca = ColumnAccessor({name: column}, verify=False) + return cls._from_data(ca) - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - Index(['a', 'b', None], dtype='object') - >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) - 0 a - 1 b - 2 - dtype: object - """ - return cls(ColumnBase.from_arrow(array)) + @classmethod + @_performance_tracking + def from_arrow(cls, array) -> Self: + raise NotImplementedError @_performance_tracking def to_arrow(self) -> pa.Array: diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index afb3496311b..99d85c0c5c0 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
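The hunks above consistently replace public `cudf.Series(column)` construction with the new private `Series._from_column` classmethod. A minimal sketch of the pattern, assuming only what the diffs themselves introduce (`_from_column` and `ColumnBase.from_arrow` are internal cudf APIs, so treat the exact spellings as illustrative):

    import pyarrow as pa
    import cudf
    from cudf.core.column import ColumnBase

    # Build a bare ColumnBase (not a Series) from an arrow array, then wrap
    # it with the zero-inference fast path added in single_column_frame.py.
    col = ColumnBase.from_arrow(pa.array(["a", "b", None]))
    s = cudf.Series._from_column(col, name="letters")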
from __future__ import annotations @@ -22,7 +22,9 @@ class TokenizeVocabulary: def __init__(self, vocabulary: "cudf.Series"): self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) - def tokenize(self, text, delimiter: str = "", default_id: int = -1): + def tokenize( + self, text, delimiter: str = "", default_id: int = -1 + ) -> cudf.Series: """ Parameters ---------- @@ -45,4 +47,4 @@ def tokenize(self, text, delimiter: str = "", default_id: int = -1): text._column, self.vocabulary, delim, default_id ) - return cudf.Series(result) + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index c6e2b5d10e1..2f77778116f 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -18,6 +18,8 @@ ) from cudf.api.types import is_integer, is_scalar from cudf.core import column +from cudf.core.column_accessor import ColumnAccessor +from cudf.core.index import ensure_index # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { @@ -275,7 +277,7 @@ def to_datetime( format=format, utc=utc, ) - return cudf.Series(col, index=arg.index) + return cudf.Series._from_column(col, index=arg.index) else: col = _process_col( col=column.as_column(arg), @@ -286,9 +288,12 @@ def to_datetime( utc=utc, ) if isinstance(arg, (cudf.BaseIndex, pd.Index)): - return cudf.Index(col, name=arg.name) + ca = ColumnAccessor({arg.name: col}, verify=False) + return cudf.DatetimeIndex._from_data(ca) elif isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series(col, index=arg.index, name=arg.name) + return cudf.Series._from_column( + col, name=arg.name, index=ensure_index(arg.index) + ) elif is_scalar(arg): return col.element_indexing(0) else: diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 07158e4ee61..8b95f6f6a04 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,6 +1,8 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -11,8 +13,12 @@ from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype +from cudf.core.index import ensure_index from cudf.utils.dtypes import can_convert_to_column +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + def to_numeric(arg, errors="raise", downcast=None): """ @@ -164,7 +170,9 @@ def to_numeric(arg, errors="raise", downcast=None): break if isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series(col, index=arg.index, name=arg.name) + return cudf.Series._from_column( + col, name=arg.name, index=ensure_index(arg.index) + ) else: if col.has_nulls(): # To match pandas, always return a floating type filled with nan. 
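As the hunk above shows, `to_numeric` (like `to_datetime` earlier in this patch) now rebuilds its result with `Series._from_column`, passing along the input's `name` and an `ensure_index`-normalized index. A hedged usage sketch of the behavior these hunks preserve (values are illustrative):

    import pandas as pd
    import cudf

    ps = pd.Series(["1", "2", "x"], index=[10, 11, 12], name="vals")
    out = cudf.to_numeric(ps, errors="coerce")  # "x" becomes null/NaN
    # name and index carry over from the pandas input
    assert out.name == "vals"
    assert out.index.to_pandas().tolist() == [10, 11, 12]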
@@ -226,25 +234,10 @@ def _convert_str_col(col, errors, _downcast=None): raise ValueError("Unable to convert some strings to numerics.") -def _proc_inf_empty_strings(col): +def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: """Handles empty and infinity strings""" col = libstrings.to_lower(col) - col = _proc_empty_strings(col) - col = _proc_inf_strings(col) - return col - - -def _proc_empty_strings(col): - """Replaces empty strings with NaN""" - s = cudf.Series(col) - s = s.where(s != "", "NaN") - return s._column - - -def _proc_inf_strings(col): - """Convert "inf/infinity" strings into "Inf", the native string - representing infinity in libcudf - """ + col = col.find_and_replace(as_column([""]), as_column(["NaN"])) # TODO: This can be handled by libcudf in # future see StringColumn.as_numerical_column col = libstrings.replace_multi( diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index 7b183d5f1a3..dbabaacf6b5 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -5,7 +5,6 @@ import cudf from cudf._lib.transform import bools_to_mask -from cudf.core.column_accessor import ColumnAccessor __all__ = ["timeseries", "randomdata"] @@ -73,9 +72,7 @@ def timeseries( ) mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) masked_col = gdf[col]._column.set_mask(mask_buf) - gdf[col] = cudf.Series._from_data( - ColumnAccessor({None: masked_col}), index=gdf.index - ) + gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) return gdf diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index d3d99aab0cd..1347b2cc38f 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf.Series._from_data({None: cudf_obj}) + gdf = cudf.Series._from_column(cudf_obj) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index a11022c1a17..f9b0d9c1e78 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -27,8 +27,12 @@ def test_dataframe_apply_rows(dtype, has_nulls, pessimistic): gdf_series_expected = gdf_series_a * gdf_series_b else: # optimistically ignore the null masks - a = cudf.Series(column.build_column(gdf_series_a.data, dtype)) - b = cudf.Series(column.build_column(gdf_series_b.data, dtype)) + a = cudf.Series._from_column( + column.build_column(gdf_series_a.data, dtype) + ) + b = cudf.Series._from_column( + column.build_column(gdf_series_b.data, dtype) + ) gdf_series_expected = a * b df_expected = cudf.DataFrame( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index c288155112c..4aa7fb27c9b 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -95,7 +95,7 @@ def test_column_offset_and_size(pandas_input, offset, size): else: assert col.size == (col.data.size / col.dtype.itemsize) - got = cudf.Series(col) + got = cudf.Series._from_column(col) if offset is None: offset = 0 @@ -112,8 +112,8 @@ def test_column_offset_and_size(pandas_input, offset, size): def column_slicing_test(col, offset, size, cast_to_float=False): col_slice = col.slice(offset, offset + size) - series = cudf.Series(col) - sliced_series = cudf.Series(col_slice) + series = 
cudf.Series._from_column(col) + sliced_series = cudf.Series._from_column(col_slice) if cast_to_float: pd_series = series.astype(float).to_pandas() @@ -208,7 +208,9 @@ def test_as_column_scalar_with_nan(nan_as_null, scalar, size): ) got = ( - cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null)) + cudf.Series._from_column( + as_column(scalar, length=size, nan_as_null=nan_as_null) + ) .dropna() .to_numpy() ) @@ -250,12 +252,18 @@ def test_column_chunked_array_creation(): actual_column = cudf.core.column.as_column(chunked_array, dtype="float") expected_column = cudf.core.column.as_column(pyarrow_array, dtype="float") - assert_eq(cudf.Series(actual_column), cudf.Series(expected_column)) + assert_eq( + cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected_column), + ) actual_column = cudf.core.column.as_column(chunked_array) expected_column = cudf.core.column.as_column(pyarrow_array) - assert_eq(cudf.Series(actual_column), cudf.Series(expected_column)) + assert_eq( + cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected_column), + ) @pytest.mark.parametrize( @@ -287,7 +295,7 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): gpu_data_view = gpu_data.view(to_dtype) expect = pd.Series(cpu_data_view, dtype=cpu_data_view.dtype) - got = cudf.Series(gpu_data_view, dtype=gpu_data_view.dtype) + got = cudf.Series._from_column(gpu_data_view).astype(gpu_data_view.dtype) gpu_ptr = gpu_data.data.get_ptr(mode="read") assert gpu_ptr == got._column.data.get_ptr(mode="read") @@ -327,7 +335,7 @@ def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_valid_string_to_numeric(data, to_dtype): - expect = cudf.Series(cudf.Series(data)._column.view(to_dtype)) + expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype)) got = cudf.Series(str_host_view(data, to_dtype)) assert_eq(expect, got) @@ -342,7 +350,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="int32") expect = cudf.Series(expect_data, dtype="float32") - got = cudf.Series(sr._column.view("float32")) + got = cudf.Series._from_column(sr._column.view("float32")) assert_eq(expect, got) @@ -354,7 +362,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="float64") expect = cudf.Series(expect_data, dtype="int64") - got = cudf.Series(sr._column.view("int64")) + got = cudf.Series._from_column(sr._column.view("int64")) assert_eq(expect, got) @@ -365,7 +373,9 @@ def test_column_view_numeric_slice(slc): sr = cudf.Series(data) expect = cudf.Series(data[slc].view("int64")) - got = cudf.Series(sr._column.slice(slc.start, slc.stop).view("int64")) + got = cudf.Series._from_column( + sr._column.slice(slc.start, slc.stop).view("int64") + ) assert_eq(expect, got) @@ -376,7 +386,7 @@ def test_column_view_numeric_slice(slc): def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] - expect = cudf.Series( + expect = cudf.Series._from_column( cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8") ) got = cudf.Series(str_host_view(data[slc], "int8")) @@ -409,7 +419,10 @@ def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( cudf.core.buffer.as_buffer(data), dtype=data.dtype ) - assert_eq(cudf.Series(actual_column), cudf.Series(expected)) + assert_eq( + cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected), + ) @pytest.mark.parametrize( @@ -436,7 +449,10 @@ def test_as_column_arrow_array(data, 
pyarrow_kwargs, cudf_kwargs): pyarrow_data = pa.array(data, **pyarrow_kwargs) cudf_from_pyarrow = as_column(pyarrow_data) expected = as_column(data, **cudf_kwargs) - assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected)) + assert_eq( + cudf.Series._from_column(cudf_from_pyarrow), + cudf.Series._from_column(expected), + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e2ce5c03b70..2c59253d500 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4264,34 +4264,36 @@ def test_empty_dataframe_describe(): def test_as_column_types(): col = column.as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="object")) assert_eq(pds, gds) pds = pd.Series(np.array([1, 2, 3]), dtype="float32") - gds = cudf.Series(column.as_column(np.array([1, 2, 3]), dtype="float32")) + gds = cudf.Series._from_column( + column.as_column(np.array([1, 2, 3]), dtype="float32") + ) assert_eq(pds, gds) @@ -4301,23 +4303,25 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series([], dtype="float64") - gds = cudf.Series(column.as_column(pds)) + gds = cudf.Series._from_column(column.as_column(pds)) assert_eq(pds, gds) pds = pd.Series([1, 2, 4], dtype="int64") - gds = cudf.Series(column.as_column(cudf.Series([1, 2, 4]), dtype="int64")) + gds = cudf.Series._from_column( + column.as_column(cudf.Series([1, 2, 4]), dtype="int64") + ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = cudf.Series( + gds = cudf.Series._from_column( column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = cudf.Series( + gds = cudf.Series._from_column( column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") ) @@ -6521,7 +6525,9 @@ def test_from_pandas_for_series_nan_as_null(nan_as_null): data = [np.nan, 2.0, 3.0] psr = pd.Series(data) - expected = cudf.Series(column.as_column(data, nan_as_null=nan_as_null)) + expected = cudf.Series._from_column( + column.as_column(data, nan_as_null=nan_as_null) + ) got = cudf.from_pandas(psr, nan_as_null=nan_as_null) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 65f739bc74a..b63788d20b7 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -106,7 +106,7 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale) ) - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = 
cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -146,7 +146,7 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): .cast("float64") .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) ) - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -206,9 +206,9 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype): pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False ) if isinstance(to_dtype, Decimal32Dtype): - expected = cudf.Series(Decimal32Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal32Column.from_arrow(pa_arr)) elif isinstance(to_dtype, Decimal64Dtype): - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning): got = s.astype(to_dtype) @@ -245,7 +245,7 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast(to_dtype, safe=False) got = got.astype(to_dtype) - expected = cudf.Series(NumericalColumn.from_arrow(pa_arr)) + expected = cudf.Series._from_column(NumericalColumn.from_arrow(pa_arr)) assert_eq(got, expected) assert_eq(got.dtype, expected.dtype) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 7f48e414180..44270d20d59 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -78,7 +78,7 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): # FIXME: In gh-10202 some minimal fixes were added to unblock CI. But # currently only non-null values are compared, null positions are # unchecked. 
- non_null_idxs = ~cudf.Series(cudfcol).isna() + non_null_idxs = cudfcol.notnull() assert_eq( col_from_buf.apply_boolean_mask(non_null_idxs), cudfcol.apply_boolean_mask(non_null_idxs), @@ -86,8 +86,8 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get() col_array = cp.asarray(cudfcol.data_array_view(mode="read")).get() assert_eq( - array_from_dlpack[non_null_idxs.to_numpy()].flatten(), - col_array[non_null_idxs.to_numpy()].flatten(), + array_from_dlpack[non_null_idxs.values_host].flatten(), + col_array[non_null_idxs.values_host].flatten(), ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 36bcaa66d7d..c4c883ca9f9 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -946,5 +946,5 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage(): null_count=col.null_count, children=(column_empty(0, col.children[0].dtype), empty_inner), ) - ser = cudf.Series._from_data({None: col_empty_offset}) + ser = cudf.Series._from_column(col_empty_offset) assert ser.memory_usage() == 8 diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 719e8a33285..0f13a9e173a 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -127,7 +127,7 @@ def test_pickle_categorical_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq(Series(out), Series(input_col)) + assert_eq(Series._from_column(out), Series._from_column(input_col)) @pytest.mark.parametrize( @@ -148,4 +148,4 @@ def test_pickle_string_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq(Series(out), Series(input_col)) + assert_eq(Series._from_column(out), Series._from_column(input_col)) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d4fe5ff3bb5..1973fe6fb41 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -817,12 +817,12 @@ def test_fillna_string(ps_data, fill_value, inplace): def test_series_fillna_invalid_dtype(data_dtype): gdf = cudf.Series([1, 2, None, 3], dtype=data_dtype) fill_value = 2.5 - with pytest.raises(TypeError) as raises: - gdf.fillna(fill_value) - raises.match( + msg = ( f"Cannot safely cast non-equivalent" f" {type(fill_value).__name__} to {gdf.dtype.type.__name__}" ) + with pytest.raises(TypeError, match=msg): + gdf.fillna(fill_value) @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8ed78d804bf..6a1887afb1f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2041,7 +2041,7 @@ def test_series_ordered_dedup(): sr = cudf.Series(np.random.randint(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) - got = cudf.Series(sr._column.unique()) + got = cudf.Series._from_column(sr._column.unique()) assert_eq(expect.values, got.values) @@ -2697,7 +2697,9 @@ def test_series_duplicate_index_reindex(): def test_list_category_like_maintains_dtype(): dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) data = [1, 2, 3] - result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + result = cudf.Series._from_column( + cudf.core.column.as_column(data, dtype=dtype) + ) expected = pd.Series(data, 
dtype=dtype.to_pandas()) assert_eq(result, expected) @@ -2705,7 +2707,9 @@ def test_list_category_like_maintains_dtype(): def test_list_interval_like_maintains_dtype(): dtype = cudf.IntervalDtype(subtype=np.int8) data = [pd.Interval(1, 2)] - result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + result = cudf.Series._from_column( + cudf.core.column.as_column(data, dtype=dtype) + ) expected = pd.Series(data, dtype=dtype.to_pandas()) assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 69122cdbafa..5406836ba61 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -178,13 +178,19 @@ def test_column_set_equal_length_object_by_mask(): bool_col = cudf.Series([True, True, True, True, True])._column data[bool_col] = replace_data - assert_eq(cudf.Series(data), cudf.Series(replace_data)) + assert_eq( + cudf.Series._from_column(data), + cudf.Series._from_column(replace_data), + ) data = cudf.Series([0, 0, 1, 1, 1])._column bool_col = cudf.Series([True, False, True, False, True])._column data[bool_col] = replace_data - assert_eq(cudf.Series(data), cudf.Series([100, 0, 300, 1, 500])) + assert_eq( + cudf.Series._from_column(data), + cudf.Series([100, 0, 300, 1, 500]), + ) def test_column_set_unequal_length_object_by_mask(): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index f447759d010..4bd084a3938 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2677,7 +2677,7 @@ def test_string_int_to_ipv4(): ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] ) - got = cudf.Series(gsr._column.int2ip()) + got = cudf.Series._from_column(gsr._column.int2ip()) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 4432d2afc8e..69876d97aad 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -96,7 +96,7 @@ def run_udf_test(data, func, dtype): else: result = output - got = cudf.Series(result, dtype=dtype) + got = cudf.Series._from_column(result.astype(dtype)) assert_eq(expect, got, check_dtype=False) with _CUDFNumbaConfig(): udf_str_kernel.forall(len(data))(str_views, output) @@ -105,7 +105,7 @@ def run_udf_test(data, func, dtype): else: result = output - got = cudf.Series(result, dtype=dtype) + got = cudf.Series._from_column(result.astype(dtype)) assert_eq(expect, got, check_dtype=False) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 4bdb5d921ec..2b1f745fc04 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -102,6 +102,7 @@ def _nest_list_data(data, leaf_type): @_dask_cudf_performance_tracking def _get_non_empty_data(s): + """Return a non empty column as metadata.""" if isinstance(s, cudf.core.column.CategoricalColumn): categories = ( s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] @@ -128,7 +129,7 @@ def _get_non_empty_data(s): data = [{key: None for key in struct_dtype.fields.keys()}] * 2 data = cudf.core.column.as_column(data, dtype=s.dtype) elif is_string_dtype(s.dtype): - data = pa.array(["cat", "dog"]) + data = cudf.core.column.as_column(pa.array(["cat", "dog"])) elif isinstance(s.dtype, pd.DatetimeTZDtype): from cudf.utils.dtypes import get_time_unit @@ -153,7 +154,7 @@ def _nonempty_series(s, idx=None): idx = 
_nonempty_index(s.index) data = _get_non_empty_data(s._column) - return cudf.Series(data, name=s.name, index=idx) + return cudf.Series._from_column(data, name=s.name, index=idx) @meta_nonempty.register(cudf.DataFrame) @@ -424,7 +425,7 @@ def hash_object_cudf_index(ind, index=None): return ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return cudf.Series(col).hash_values() + return cudf.Series._from_column(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index aab56e3a1b0..3181c8d69ec 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -342,7 +342,7 @@ def groupby(self, by=None, **kwargs): def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series(outcol) + return cudf.Series._from_column(outcol) @_dask_cudf_performance_tracking From 3fd8783e49246f4ae61351375201d616d5ab6b55 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 7 Aug 2024 13:00:09 -0700 Subject: [PATCH 032/270] Add `stream` param to stream compaction APIs (#16295) Add `stream` param to a bunch of stream compaction APIs. Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) - Karthikeyan (https://github.com/karthikeyann) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16295 --- cpp/include/cudf/detail/stream_compaction.hpp | 30 +- .../cudf/lists/detail/stream_compaction.hpp | 9 +- cpp/include/cudf/stream_compaction.hpp | 30 +- .../stream_compaction/apply_boolean_mask.cu | 3 +- cpp/src/stream_compaction/distinct.cu | 4 +- cpp/src/stream_compaction/distinct_count.cu | 11 +- cpp/src/stream_compaction/drop_nans.cu | 6 +- cpp/src/stream_compaction/drop_nulls.cu | 6 +- cpp/src/stream_compaction/unique.cu | 3 +- cpp/src/stream_compaction/unique_count.cu | 8 +- .../stream_compaction/unique_count_column.cu | 7 +- cpp/tests/streams/stream_compaction_test.cpp | 365 ++++++++++++++---- java/src/main/native/src/TableJni.cpp | 1 + 13 files changed, 362 insertions(+), 121 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 05194148a70..85d2ee9790f 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -29,9 +29,7 @@ namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::drop_nulls(table_view const&, std::vector const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * cudf::size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr
drop_nulls(table_view const& input,
                                 std::vector<size_type> const& keys,
@@ -41,9 +39,7 @@ std::unique_ptr<table> drop_nulls(table_view const& input,
                                 std::vector<size_type> const& keys,

 /**
  * @copydoc cudf::drop_nans(table_view const&, std::vector<size_type> const&,
- * cudf::size_type, rmm::device_async_resource_ref)
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ * cudf::size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> drop_nans(table_view const& input,
                                 std::vector<size_type> const& keys,
@@ -53,8 +49,6 @@ std::unique_ptr<table> drop_nans(table_view const& input,
                                 std::vector<size_type> const& keys,

 /**
  * @copydoc cudf::apply_boolean_mask
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> apply_boolean_mask(table_view const& input,
                                           column_view const& boolean_mask,
@@ -63,8 +57,6 @@ std::unique_ptr<table> apply_boolean_mask(table_view const& input,
                                           column_view const& boolean_mask,

 /**
  * @copydoc cudf::unique
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> unique(table_view const& input,
                               std::vector<size_type> const& keys,
@@ -75,8 +67,6 @@ std::unique_ptr<table> unique(table_view const& input,
                               std::vector<size_type> const& keys,

 /**
  * @copydoc cudf::distinct
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table>
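// Example (not part of this patch): with the stream parameters this commit
// adds, callers can run stream compaction on an explicit CUDA stream instead
// of the default one. `tbl` below is an assumed pre-existing cudf::table_view;
// the signatures match the public header changes later in this patch.
//
//   rmm::cuda_stream stream_owner;                  // owning stream
//   rmm::cuda_stream_view sv{stream_owner.view()};  // non-owning view
//   auto no_nulls = cudf::drop_nulls(tbl, /*keys=*/{0}, sv);
//   auto uniques  = cudf::distinct(tbl, {0},
//                                  cudf::duplicate_keep_option::KEEP_ANY,
//                                  cudf::null_equality::EQUAL,
//                                  cudf::nan_equality::ALL_EQUAL, sv);
//   auto n_rows   = cudf::distinct_count(tbl, cudf::null_equality::EQUAL, sv);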
distinct(table_view const& input, std::vector const& keys, @@ -110,9 +100,7 @@ rmm::device_uvector distinct_indices(table_view const& input, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy, rmm::cuda_stream_view) */ cudf::size_type unique_count(column_view const& input, null_policy null_handling, @@ -120,18 +108,14 @@ cudf::size_type unique_count(column_view const& input, rmm::cuda_stream_view stream); /** - * @copydoc cudf::unique_count(table_view const&, null_equality) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::unique_count(table_view const&, null_equality, rmm::cuda_stream_view) */ cudf::size_type unique_count(table_view const& input, null_equality nulls_equal, rmm::cuda_stream_view stream); /** - * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy, rmm::cuda_stream_view) */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, @@ -139,9 +123,7 @@ cudf::size_type distinct_count(column_view const& input, rmm::cuda_stream_view stream); /** - * @copydoc cudf::distinct_count(table_view const&, null_equality) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::distinct_count(table_view const&, null_equality, rmm::cuda_stream_view) */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal, diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index c11e07cd190..be0bd27083c 100644 --- a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -26,10 +26,7 @@ namespace CUDF_EXPORT cudf { namespace lists::detail { /** - * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&, - * rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for device memory operations and kernel launches + * @copydoc cudf::lists::apply_boolean_mask */ std::unique_ptr apply_boolean_mask(lists_column_view const& input, lists_column_view const& boolean_mask, @@ -37,9 +34,7 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::list::distinct - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::lists::distinct */ std::unique_ptr distinct(lists_column_view const& input, null_equality nulls_equal, diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index cfe404ff6ab..ced8d5849d0 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -67,6 +67,7 @@ namespace CUDF_EXPORT cudf { * @param[in] keys vector of indices representing key columns from `input` * @param[in] keep_threshold The minimum number of non-null fields in a row * required to keep the row. 
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` with at least @p * keep_threshold non-null fields in @p keys. @@ -75,6 +76,7 @@ std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -99,6 +101,7 @@ std::unique_ptr
drop_nulls( * * @param[in] input The input `table_view` to filter * @param[in] keys vector of indices representing key columns from `input` + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` without nulls in the columns * of @p keys. @@ -106,6 +109,7 @@ std::unique_ptr
drop_nulls( std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -141,6 +145,7 @@ std::unique_ptr
drop_nulls( * @param[in] keys vector of indices representing key columns from `input` * @param[in] keep_threshold The minimum number of non-NAN elements in a row * required to keep the row. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` with at least @p * keep_threshold non-NAN elements in @p keys. @@ -149,6 +154,7 @@ std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -174,6 +180,7 @@ std::unique_ptr
drop_nans( * * @param[in] input The input `table_view` to filter * @param[in] keys vector of indices representing key columns from `input` + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` without NANs in the columns * of @p keys. @@ -181,6 +188,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -200,6 +208,7 @@ std::unique_ptr
drop_nans( * @param[in] input The input table_view to filter * @param[in] boolean_mask A nullable column_view of type type_id::BOOL8 used * as a mask to filter the `input`. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing copy of all rows of @p input passing * the filter defined by @p boolean_mask. @@ -207,6 +216,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
apply_boolean_mask( table_view const& input, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -241,6 +251,7 @@ enum class duplicate_keep_option { * @param[in] keep keep any, first, last, or none of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device * memory * @@ -251,6 +262,7 @@ std::unique_ptr
unique( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -269,6 +281,7 @@ std::unique_ptr
unique( * @param keep Copy any, first, last, or none of the found duplicates * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows in an unspecified order */ @@ -278,6 +291,7 @@ std::unique_ptr
distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -346,12 +360,14 @@ std::unique_ptr
stable_distinct( * @param[in] input The column_view whose consecutive groups of equivalent rows will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of consecutive groups of equivalent rows in the column */ cudf::size_type unique_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling); + nan_policy nan_handling, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Count the number of consecutive groups of equivalent rows in a table. @@ -359,11 +375,13 @@ cudf::size_type unique_count(column_view const& input, * @param[in] input Table whose consecutive groups of equivalent rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal * nulls are not equal if null_equality::UNEQUAL. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of consecutive groups of equivalent rows in the column */ cudf::size_type unique_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL); + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Count the distinct elements in the column_view. @@ -382,12 +400,14 @@ cudf::size_type unique_count(table_view const& input, * @param[in] input The column_view whose distinct elements will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of distinct rows in the table */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling); + nan_policy nan_handling, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Count the distinct rows in a table. @@ -395,11 +415,13 @@ cudf::size_type distinct_count(column_view const& input, * @param[in] input Table whose distinct rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal. * nulls are not equal if null_equality::UNEQUAL. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of distinct rows in the table */ cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL); + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu index cdca9517d94..9812f4ffbd7 100644 --- a/cpp/src/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/stream_compaction/apply_boolean_mask.cu @@ -91,9 +91,10 @@ std::unique_ptr
apply_boolean_mask(table_view const& input,
 */
 std::unique_ptr<table>
apply_boolean_mask(table_view const& input, column_view const& boolean_mask, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr); + return detail::apply_boolean_mask(input, boolean_mask, stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 6afd6e34c50..24e2692cb6f 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -150,11 +150,11 @@ std::unique_ptr
distinct(table_view const& input, duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::distinct( - input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr); } std::unique_ptr distinct_indices(table_view const& input, diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index cdf9faddf31..78eb0fa5212 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -218,15 +218,18 @@ cudf::size_type distinct_count(column_view const& input, cudf::size_type distinct_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling) + nan_policy nan_handling, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, null_handling, nan_handling, cudf::get_default_stream()); + return detail::distinct_count(input, null_handling, nan_handling, stream); } -cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) +cudf::size_type distinct_count(table_view const& input, + null_equality nulls_equal, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, nulls_equal, cudf::get_default_stream()); + return detail::distinct_count(input, nulls_equal, stream); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu index b46381c8ff6..b98ebbc2ecc 100644 --- a/cpp/src/stream_compaction/drop_nans.cu +++ b/cpp/src/stream_compaction/drop_nans.cu @@ -117,20 +117,22 @@ std::unique_ptr
drop_nans(table_view const& input,
 std::unique_ptr<table>
drop_nans(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nans(input, keys, keep_threshold, cudf::get_default_stream(), mr); + return detail::drop_nans(input, keys, keep_threshold, stream, mr); } /* * Filters a table to remove nan elements. */ std::unique_ptr
drop_nans(table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nans(input, keys, keys.size(), cudf::get_default_stream(), mr); + return detail::drop_nans(input, keys, keys.size(), stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index cb7cd61bf02..2497e4e5065 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -90,20 +90,22 @@ std::unique_ptr
drop_nulls(table_view const& input,
 std::unique_ptr<table>
drop_nulls(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nulls(input, keys, keep_threshold, cudf::get_default_stream(), mr); + return detail::drop_nulls(input, keys, keep_threshold, stream, mr); } /* * Filters a table to remove null elements. */ std::unique_ptr
drop_nulls(table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nulls(input, keys, keys.size(), cudf::get_default_stream(), mr); + return detail::drop_nulls(input, keys, keys.size(), stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu index edb47984d13..93de0e60b6d 100644 --- a/cpp/src/stream_compaction/unique.cu +++ b/cpp/src/stream_compaction/unique.cu @@ -119,10 +119,11 @@ std::unique_ptr
unique(table_view const& input, std::vector const& keys, duplicate_keep_option const keep, null_equality nulls_equal, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::unique(input, keys, keep, nulls_equal, cudf::get_default_stream(), mr); + return detail::unique(input, keys, keep, nulls_equal, stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index 19607fe8105..d842f63cd7b 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,10 +67,12 @@ cudf::size_type unique_count(table_view const& keys, } // namespace detail -cudf::size_type unique_count(table_view const& input, null_equality nulls_equal) +cudf::size_type unique_count(table_view const& input, + null_equality nulls_equal, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::unique_count(input, nulls_equal, cudf::get_default_stream()); + return detail::unique_count(input, nulls_equal, stream); } } // namespace cudf diff --git a/cpp/src/stream_compaction/unique_count_column.cu b/cpp/src/stream_compaction/unique_count_column.cu index 16758b6e3a7..89ce2391a7b 100644 --- a/cpp/src/stream_compaction/unique_count_column.cu +++ b/cpp/src/stream_compaction/unique_count_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -101,10 +101,11 @@ cudf::size_type unique_count(column_view const& input, cudf::size_type unique_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling) + nan_policy nan_handling, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::unique_count(input, null_handling, nan_handling, cudf::get_default_stream()); + return detail::unique_count(input, null_handling, nan_handling, stream); } } // namespace cudf diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 56443870602..443f4548b2c 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -41,6 +41,7 @@ auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; +using int16s_col = cudf::test::fixed_width_column_wrapper; using int32s_col = cudf::test::fixed_width_column_wrapper; using floats_col = cudf::test::fixed_width_column_wrapper; @@ -51,50 +52,9 @@ using cudf::test::iterators::no_nulls; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; -struct StableDistinctKeepAny : public cudf::test::BaseFixture {}; +struct StreamCompactionTest : public cudf::test::BaseFixture {}; -struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {}; - -TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs) -{ - // Column(s) used to test KEEP_ANY needs to have same rows in contiguous - // groups for equivalent keys because KEEP_ANY is nondeterministic. 
- auto const col1 = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5}; - auto const col2 = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4}; - auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9}; - auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.}; - - auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; - auto const key_idx = std::vector{2, 3}; - - // NaNs are unequal. - { - auto const exp_col1 = int32s_col{6, 1, 1, 1, 3, 5, 8, 5}; - auto const exp_col2 = floats_col{6, 1, 1, 1, 3, 4, 9, 4}; - auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9}; - auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.}; - auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; - - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); - } - - // NaNs are equal. - { - auto const exp_col1 = int32s_col{6, 1, 3, 5, 8, 5}; - auto const exp_col2 = floats_col{6, 1, 3, 4, 9, 4}; - auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9}; - auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.}; - auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; - - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); - } -} - -TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs) +TEST_F(StreamCompactionTest, StableDistinctKeepAny) { auto constexpr null{0.0}; // shadow the global `null` variable of type int @@ -150,7 +110,7 @@ TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs) } } -TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual) +TEST_F(StreamCompactionTest, StableDistinctKeepFirstLastNone) { // Column(s) used to test needs to have different rows for the same keys. auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; @@ -192,44 +152,313 @@ TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual) } } -TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal) +TEST_F(StreamCompactionTest, DropNaNs) { - // Column(s) used to test needs to have different rows for the same keys. 
- auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; - auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.}; - auto const input = cudf::table_view{{col, keys}}; - auto const key_idx = std::vector{1}; + auto const col1 = floats_col{{1., 2., NaN, NaN, 5., 6.}, nulls_at({2, 5})}; + auto const col2 = int32s_col{{10, 40, 70, 5, 2, 10}, nulls_at({2, 5})}; + auto const col3 = floats_col{{NaN, 40., 70., NaN, 2., 10.}, nulls_at({2, 5})}; + cudf::table_view input{{col1, col2, col3}}; + + std::vector keys{0, 2}; - // KEEP_FIRST { - auto const exp_col = int32s_col{0, 1, 2, 3, 4, 6}; - auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.}; - auto const expected = cudf::table_view{{exp_col, exp_keys}}; + // With keep_threshold + auto const col1_expected = floats_col{{1., 2., 3., 5., 6.}, nulls_at({2, 4})}; + auto const col2_expected = int32s_col{{10, 40, 70, 2, 10}, nulls_at({2, 4})}; + auto const col3_expected = floats_col{{NaN, 40., 70., 2., 10.}, nulls_at({2, 4})}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nans(input, keys, keys.size() - 1, cudf::test::get_default_stream()); - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); } - // KEEP_LAST { - auto const exp_col = int32s_col{1, 2, 4, 5, 6, 7}; - auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.}; - auto const expected = cudf::table_view{{exp_col, exp_keys}}; + // Without keep_threshold + auto const col1_expected = floats_col{{2., 3., 5., 6.}, nulls_at({1, 3})}; + auto const col2_expected = int32s_col{{40, 70, 2, 10}, nulls_at({1, 3})}; + auto const col3_expected = floats_col{{40., 70., 2., 10.}, nulls_at({1, 3})}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nans(input, keys, cudf::test::get_default_stream()); - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); } +} + +TEST_F(StreamCompactionTest, DropNulls) +{ + auto const col1 = int16s_col{{1, 0, 1, 0, 1, 0}, nulls_at({2, 5})}; + auto const col2 = int32s_col{{10, 40, 70, 5, 2, 10}, nulls_at({2})}; + auto const col3 = floats_col{{10., 40., 70., 5., 2., 10.}, no_nulls()}; + cudf::table_view input{{col1, col2, col3}}; + std::vector keys{0, 1, 2}; - // KEEP_NONE { - auto const exp_col = int32s_col{1, 2, 4, 6}; - auto const exp_keys = floats_col{NaN, NaN, 21., 22.}; - auto const expected = cudf::table_view{{exp_col, exp_keys}}; + // With keep_threshold + auto const col1_expected = int16s_col{{1, 0, 0, 1, 0}, null_at(4)}; + auto const col2_expected = int32s_col{{10, 40, 5, 2, 10}, no_nulls()}; + auto const col3_expected = floats_col{{10., 40., 5., 2., 10.}, no_nulls()}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nulls(input, keys, keys.size() - 1, cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + { + // Without keep_threshold + auto const col1_expected = int16s_col{{1, 0, 0, 1}, no_nulls()}; + auto const col2_expected = int32s_col{{10, 40, 5, 2}, no_nulls()}; + auto const col3_expected = floats_col{{10., 40., 5., 2.}, no_nulls()}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nulls(input, keys, 
cudf::test::get_default_stream()); - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); } } + +TEST_F(StreamCompactionTest, Unique) +{ + auto const col1 = int32s_col{5, 4, 3, 5, 8, 5}; + auto const col2 = floats_col{4., 5., 3., 4., 9., 4.}; + auto const col1_key = int32s_col{20, 20, 20, 19, 21, 9}; + auto const col2_key = int32s_col{19, 19, 20, 20, 9, 21}; + + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys = {2, 3}; + + { + // KEEP_FIRST + auto const exp_col1_first = int32s_col{5, 3, 5, 8, 5}; + auto const exp_col2_first = floats_col{4., 3., 4., 9., 4.}; + auto const exp_col1_key_first = int32s_col{20, 20, 19, 21, 9}; + auto const exp_col2_key_first = int32s_col{19, 20, 20, 9, 21}; + cudf::table_view expected_first{ + {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; + + auto const result = cudf::unique(input, + keys, + cudf::duplicate_keep_option::KEEP_FIRST, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, *result); + } + + { + // KEEP_LAST + auto const exp_col1_last = int32s_col{4, 3, 5, 8, 5}; + auto const exp_col2_last = floats_col{5., 3., 4., 9., 4.}; + auto const exp_col1_key_last = int32s_col{20, 20, 19, 21, 9}; + auto const exp_col2_key_last = int32s_col{19, 20, 20, 9, 21}; + cudf::table_view expected_last{ + {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; + + auto const result = cudf::unique(input, + keys, + cudf::duplicate_keep_option::KEEP_LAST, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, *result); + } + + { + // KEEP_NONE + auto const exp_col1_unique = int32s_col{3, 5, 8, 5}; + auto const exp_col2_unique = floats_col{3., 4., 9., 4.}; + auto const exp_col1_key_unique = int32s_col{20, 19, 21, 9}; + auto const exp_col2_key_unique = int32s_col{20, 20, 9, 21}; + cudf::table_view expected_unique{ + {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; + + auto const result = cudf::unique(input, + keys, + cudf::duplicate_keep_option::KEEP_NONE, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, *result); + } +} + +TEST_F(StreamCompactionTest, Distinct) +{ + // Column(s) used to test needs to have different rows for the same keys. 
+ auto const col1 = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const col2 = floats_col{10, 11, 12, 13, 14, 15, 16}; + auto const keys1 = int32s_col{20, 20, 20, 20, 19, 21, 9}; + auto const keys2 = int32s_col{19, 19, 19, 20, 20, 9, 21}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // KEEP_FIRST + { + auto const exp_col1_sort = int32s_col{6, 4, 0, 3, 5}; + auto const exp_col2_sort = floats_col{16, 14, 10, 13, 15}; + auto const exp_keys1_sort = int32s_col{9, 19, 20, 20, 21}; + auto const exp_keys2_sort = int32s_col{21, 20, 19, 20, 9}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, + key_idx, + cudf::duplicate_keep_option::KEEP_FIRST, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); + auto const result_sort = + cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_LAST + { + auto const exp_col1_sort = int32s_col{6, 4, 2, 3, 5}; + auto const exp_col2_sort = floats_col{16, 14, 12, 13, 15}; + auto const exp_keys1_sort = int32s_col{9, 19, 20, 20, 21}; + auto const exp_keys2_sort = int32s_col{21, 20, 19, 20, 9}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, + key_idx, + cudf::duplicate_keep_option::KEEP_LAST, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); + auto const result_sort = + cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_NONE + { + auto const exp_col1_sort = int32s_col{6, 4, 3, 5}; + auto const exp_col2_sort = floats_col{16, 14, 13, 15}; + auto const exp_keys1_sort = int32s_col{9, 19, 20, 21}; + auto const exp_keys2_sort = int32s_col{21, 20, 20, 9}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, + key_idx, + cudf::duplicate_keep_option::KEEP_NONE, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); + auto const result_sort = + cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } +} + +TEST_F(StreamCompactionTest, ApplyBooleanMask) +{ + auto const col = int32s_col{ + 9668, 9590, 9526, 9205, 9434, 9347, 9160, 9569, 9143, 9807, 9606, 9446, 9279, 9822, 9691}; + cudf::test::fixed_width_column_wrapper mask({false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + false, + false, + true, + false, + true}); + cudf::table_view input({col}); + auto const col_expected = int32s_col{9526, 9347, 9569, 9807, 9279, 9691}; + cudf::table_view expected({col_expected}); + auto const result = cudf::apply_boolean_mask(input, mask, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StreamCompactionTest, UniqueCountColumn) +{ + std::vector const input = {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + std::vector input_data(input.begin(), 
input.end()); + + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const expected = std::distance(input_data.begin(), new_end); + EXPECT_EQ( + expected, + cudf::unique_count( + input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID, cudf::test::get_default_stream())); +} + +TEST_F(StreamCompactionTest, UniqueCountTable) +{ + std::vector const input1 = {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, + 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + std::vector const input2 = {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}; + + std::vector> pair_input; + std::transform(input1.begin(), + input1.end(), + input2.begin(), + std::back_inserter(pair_input), + [](int32_t a, int32_t b) { return std::pair(a, b); }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const result = std::distance(pair_input.begin(), new_end); + EXPECT_EQ( + result, + cudf::unique_count(input_table, null_equality::EQUAL, cudf::test::get_default_stream())); +} + +TEST_F(StreamCompactionTest, DistinctCountColumn) +{ + std::vector const input = {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::distinct_count( + input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID, cudf::test::get_default_stream())); +} + +TEST_F(StreamCompactionTest, DistinctCountTable) +{ + std::vector const input1 = {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, + 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + std::vector const input2 = {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}; + + std::vector> pair_input; + std::transform(input1.begin(), + input1.end(), + input2.begin(), + std::back_inserter(pair_input), + [](int32_t a, int32_t b) { return std::pair(a, b); }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); + EXPECT_EQ( + expected, + cudf::distinct_count(input_table, null_equality::EQUAL, cudf::test::get_default_stream())); +} diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a9ace1398e4..76ca8c533ce 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3919,6 +3919,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( keep_option, nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return convert_table_for_return(env, result); } From b933b54858a84082980f20522738fda4969a1318 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 7 Aug 2024 20:07:42 -0500 Subject: [PATCH 033/270] Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version (#16503) Contributes to https://github.com/rapidsai/build-planning/issues/58. 
`scikit-build-core==0.10.0` was released today (https://github.com/scikit-build/scikit-build-core/releases/tag/v0.10.0), and wheel-building configurations across RAPIDS are incompatible with it. This proposes upgrading to that version and fixing configuration here in a way that: * is compatible with that new `scikit-build-core` version * takes advantage of the forward-compatibility mechanism (`minimum-version`) that `scikit-build-core` provides, to reduce the risk of needing to do this again in the future Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16503 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- dependencies.yaml | 4 ++-- python/cudf/pyproject.toml | 5 +++-- python/cudf_kafka/pyproject.toml | 5 +++-- 7 files changed, 12 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d04804cafaf..8d5fc2e31d9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -82,7 +82,7 @@ dependencies: - rich - rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scipy - spdlog>=1.12.0,<1.13 - sphinx diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e2c3558030d..7b0485d7f29 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -80,7 +80,7 @@ dependencies: - rich - rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scipy - spdlog>=1.12.0,<1.13 - sphinx diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9137f099ad1..8d7ef63715b 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -62,7 +62,7 @@ requirements: - python - cython >=3.0.3 - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.7.0 + - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 # TODO: Change to `2.0` for NumPy 2 - numpy 1.23 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 1b0e0e2c236..748a32e5518 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -61,7 +61,7 @@ requirements: - cudf ={{ version }} - libcudf_kafka ={{ version }} - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.7.0 + - scikit-build-core >=0.10.0 {% if cuda_major != "11" %} - cuda-cudart-dev {% endif %} diff --git a/dependencies.yaml b/dependencies.yaml index abb55a5e011..b0d62a9fb0d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -301,10 +301,10 @@ dependencies: - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 - output_types: conda packages: - - scikit-build-core>=0.7.0 + - scikit-build-core>=0.10.0 - output_types: [requirements, pyproject] packages: - - scikit-build-core[pyproject]>=0.7.0 + - scikit-build-core[pyproject]>=0.10.0 rapids_build_setuptools: common: - output_types: [requirements, pyproject] diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index b2ddb06d8c9..60ac171f3d7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "rapids_build_backend.build" requires = [ 
"rapids-build-backend>=0.3.0,<0.4.0.dev0", - "scikit-build-core[pyproject]>=0.7.0", + "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] @@ -133,7 +133,8 @@ requires = [ [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" -cmake.minimum-version = "3.26.4" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" ninja.make-fallback = true sdist.exclude = ["*tests*"] sdist.reproducible = true diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a9b60133f42..63c5b07c5f3 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "rapids_build_backend.build" requires = [ "rapids-build-backend>=0.3.0,<0.4.0.dev0", - "scikit-build-core[pyproject]>=0.7.0", + "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] @@ -86,7 +86,8 @@ filterwarnings = [ [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" -cmake.minimum-version = "3.26.4" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" ninja.make-fallback = true sdist.exclude = ["*tests*"] sdist.reproducible = true From c146eed6f36e7c82052a3288e1bf6ab8c2216637 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 7 Aug 2024 22:19:46 -0700 Subject: [PATCH 034/270] Expose `stream` param in transform APIs (#16452) Exposes the `stream` param in transform APIs Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16452 --- cpp/include/cudf/transform.hpp | 21 +++- cpp/src/interop/to_arrow.cu | 2 +- cpp/src/interop/to_arrow_device.cu | 4 +- cpp/src/interop/to_arrow_host.cu | 2 +- cpp/src/transform/bools_to_mask.cu | 4 +- cpp/src/transform/compute_column.cu | 3 +- cpp/src/transform/encode.cu | 4 +- cpp/src/transform/mask_to_bools.cu | 3 +- cpp/src/transform/nans_to_nulls.cu | 4 +- cpp/src/transform/one_hot_encode.cu | 3 +- cpp/src/transform/row_bit_count.cu | 11 +- cpp/src/transform/transform.cpp | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/transform_test.cpp | 164 +++++++++++++++++++++++++++ 14 files changed, 210 insertions(+), 19 deletions(-) create mode 100644 cpp/tests/streams/transform_test.cpp diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index adc5bdb2af8..f16214260f7 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf { * @param unary_udf The PTX/CUDA string of the unary function to apply * @param output_type The output type that is compatible with the output type in the UDF * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The column resulting from applying the unary function to * every element of the input @@ -56,6 +57,7 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, + rmm::cuda_stream_view stream = 
cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -65,12 +67,14 @@ std::unique_ptr<column> transform(
  * @throws cudf::logic_error if `input.type()` is a non-floating type
  *
  * @param input An immutable view of the input column of floating-point type
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned bitmask
  * @return A pair containing a `device_buffer` with the new bitmask and its
  * null count obtained by replacing `NaN` in `input` with null.
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
   column_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -83,12 +87,14 @@
  *
  * @param table The table used for expression evaluation
  * @param expr The root of the expression tree
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource
  * @return Output column
  */
 std::unique_ptr<column> compute_column(
   table_view const& table,
   ast::expression const& expr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -101,6 +107,7 @@ std::unique_ptr<column> compute_column(
  * @throws cudf::logic_error if `input.type()` is a non-boolean type
  *
  * @param input Boolean elements to convert to a bitmask
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned bitmask
  * @return A pair containing a `device_buffer` with the new bitmask and its
  * null count obtained from input considering `true` represents `valid`/`1` and
@@ -108,6 +115,7 @@
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
   column_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -130,12 +138,14 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
  * @endcode
  *
  * @param input Table containing values to be encoded
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return A pair containing the distinct rows of the input table in sorted order,
  * and a column of integer indices representing the encoded rows.
*/ std::pair, std::unique_ptr> encode( cudf::table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -162,12 +172,14 @@ std::pair, std::unique_ptr> encode( * * @param input Column containing values to be encoded * @param categories Column containing categories + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A pair containing the owner to all encoded data and a table view into the data */ std::pair, table_view> one_hot_encode( column_view const& input, column_view const& categories, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -188,6 +200,7 @@ std::pair, table_view> one_hot_encode( * @param bitmask A device pointer to the bitmask which needs to be converted * @param begin_bit position of the bit from which the conversion should start * @param end_bit position of the bit before which the conversion should stop + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A boolean column representing the given mask from [begin_bit, end_bit) */ @@ -195,6 +208,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -219,11 +233,14 @@ std::unique_ptr mask_to_bools( * row_bit_count(column(x)) >= row_bit_count(gather(column(x))) * * @param t The table view to perform the computation on + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A 32-bit integer column containing the per-row bit counts */ std::unique_ptr row_bit_count( - table_view const& t, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + table_view const& t, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for @@ -240,12 +257,14 @@ std::unique_ptr row_bit_count( * * @param t The table view to perform the computation on * @param segment_length The number of rows in each segment for which the total size is computed + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A 32-bit integer column containing the bit counts for each segment of rows */ std::unique_ptr segmented_row_bit_count( table_view const& t, size_type segment_length, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 3d41f856f4f..a867d4adfa1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -247,7 +247,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = 
bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); + auto bitmask = detail::bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index cea7cdebcba..a5f3f9d87f5 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -200,7 +200,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& column, nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); - auto bitmask = bools_to_mask(column.view(), stream, mr); + auto bitmask = detail::bools_to_mask(column.view(), stream, mr); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get())); NANOARROW_RETURN_NOT_OK( @@ -442,7 +442,7 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); - auto bitmask = bools_to_mask(column, stream, mr); + auto bitmask = detail::bools_to_mask(column, stream, mr); NANOARROW_RETURN_NOT_OK( set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get())); NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get())); diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 193b3a3b5a2..26f7c7e6e53 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -147,7 +147,7 @@ int dispatch_to_arrow_host::operator()(ArrowArray* out) const NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); - auto bitmask = bools_to_mask(column, stream, mr); + auto bitmask = detail::bools_to_mask(column, stream, mr); NANOARROW_RETURN_NOT_OK(populate_data_buffer( device_span(reinterpret_cast(bitmask.first->data()), bitmask.first->size()), diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu index c12f65deb46..452aebf4428 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -59,10 +59,10 @@ std::pair, cudf::size_type> bools_to_mask( } // namespace detail std::pair, cudf::size_type> bools_to_mask( - column_view const& input, rmm::device_async_resource_ref mr) + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::bools_to_mask(input, cudf::get_default_stream(), mr); + return detail::bools_to_mask(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 7960731f3a1..c4fc8d58552 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -138,10 +138,11 @@ std::unique_ptr compute_column(table_view const& table, std::unique_ptr compute_column(table_view const& table, ast::expression const& expr, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_column(table, expr, cudf::get_default_stream(), mr); + return detail::compute_column(table, expr, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 7a044b9f6f7..1c9d52bce1b 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -72,10 +72,10 @@ std::pair, std::unique_ptr> 
encode(table_view con } // namespace detail std::pair, std::unique_ptr> encode( - cudf::table_view const& input, rmm::device_async_resource_ref mr) + cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::encode(input, cudf::get_default_stream(), mr); + return detail::encode(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu index adf5db02d9c..be0b80a2633 100644 --- a/cpp/src/transform/mask_to_bools.cu +++ b/cpp/src/transform/mask_to_bools.cu @@ -62,9 +62,10 @@ std::unique_ptr mask_to_bools(bitmask_type const* bitmask, std::unique_ptr mask_to_bools(bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::mask_to_bools(bitmask, begin_bit, end_bit, cudf::get_default_stream(), mr); + return detail::mask_to_bools(bitmask, begin_bit, end_bit, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index fd4f33c594c..a24ba304004 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -93,10 +93,10 @@ std::pair, cudf::size_type> nans_to_nulls( } // namespace detail std::pair, cudf::size_type> nans_to_nulls( - column_view const& input, rmm::device_async_resource_ref mr) + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::nans_to_nulls(input, cudf::get_default_stream(), mr); + return detail::nans_to_nulls(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu index 808f2d1b284..46e6e55b0b7 100644 --- a/cpp/src/transform/one_hot_encode.cu +++ b/cpp/src/transform/one_hot_encode.cu @@ -115,9 +115,10 @@ std::pair, table_view> one_hot_encode(column_view const& std::pair, table_view> one_hot_encode(column_view const& input, column_view const& categories, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::one_hot_encode(input, categories, cudf::get_default_stream(), mr); + return detail::one_hot_encode(input, categories, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 12a15eb7e34..4530fabf889 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -561,23 +561,26 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return segmented_row_bit_count(t, 1, stream, mr); + return detail::segmented_row_bit_count(t, 1, stream, mr); } } // namespace detail std::unique_ptr segmented_row_bit_count(table_view const& t, size_type segment_length, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr); + return detail::segmented_row_bit_count(t, segment_length, stream, mr); } -std::unique_ptr row_bit_count(table_view const& t, rmm::device_async_resource_ref mr) +std::unique_ptr row_bit_count(table_view const& t, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::row_bit_count(t, cudf::get_default_stream(), mr); + return detail::row_bit_count(t, stream, mr); } } // namespace cudf diff --git 
a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 98ec44758b9..f5e9048fa0a 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -97,10 +97,11 @@ std::unique_ptr transform(column_view const& input, std::string const& unary_udf, data_type output_type, bool is_ptx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::transform(input, unary_udf, output_type, is_ptx, cudf::get_default_stream(), mr); + return detail::transform(input, unary_udf, output_type, is_ptx, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 5e85b3e8adf..8c4b0f1e367 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -736,6 +736,7 @@ ConfigureTest( STREAM_MODE testing ) +ConfigureTest(STREAM_TRANSFORM_TEST streams/transform_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) # ################################################################################################## diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp new file mode 100644 index 00000000000..9187672221c --- /dev/null +++ b/cpp/tests/streams/transform_test.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +class TransformTest : public cudf::test::BaseFixture {}; + +template +void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx) +{ + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); + cudf::test::fixed_width_column_wrapper in( + data_iter, data_iter + size, all_valid); + cudf::transform( + in, udf, cudf::data_type(cudf::type_to_id()), is_ptx, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, Transform) +{ + char const* cuda = + R"***( +__device__ inline void fdsf ( + float* C, + float a +) +{ + *C = a*a*a*a; +} +)***"; + + char const* ptx = + R"***( +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-24817639 +// Cuda compilation tools, release 10.0, V10.0.130 +// Based on LLVM 3.4svn +// + +.version 6.3 +.target sm_70 +.address_size 64 + + // .globl _ZN8__main__7add$241Ef +.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Ef; +.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Efx; + +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Ef( + .param .b64 _ZN8__main__7add$241Ef_param_0, + .param .b32 _ZN8__main__7add$241Ef_param_1 +) +{ + .reg .f32 %f<4>; + .reg .b32 %r<2>; + .reg .b64 %rd<2>; + + + ld.param.u64 %rd1, [_ZN8__main__7add$241Ef_param_0]; + ld.param.f32 %f1, [_ZN8__main__7add$241Ef_param_1]; + mul.f32 %f2, %f1, %f1; + mul.f32 %f3, %f2, %f2; + st.f32 [%rd1], %f3; + mov.u32 %r1, 0; + st.param.b32 [func_retval0+0], %r1; + ret; +} +)***"; + + auto data_init = [](cudf::size_type row) { return row % 3; }; + test_udf(cuda, data_init, 500, false); + test_udf(ptx, data_init, 500, true); +} + +TEST_F(TransformTest, ComputeColumn) +{ + auto c_0 = cudf::test::fixed_width_column_wrapper{3, 20, 1, 50}; + auto c_1 = cudf::test::fixed_width_column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + cudf::compute_column(table, expression, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, BoolsToMask) +{ + std::vector input({1, 0, 1, 0, 1, 0, 1, 0}); + cudf::test::fixed_width_column_wrapper input_column(input.begin(), input.end()); + cudf::bools_to_mask(input_column, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, MaskToBools) +{ + cudf::mask_to_bools(nullptr, 0, 0, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, Encode) +{ + cudf::test::fixed_width_column_wrapper input{{1, 2, 3, 2, 3, 2, 1}}; + cudf::encode(cudf::table_view({input}), cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, OneHotEncode) +{ + auto input = cudf::test::fixed_width_column_wrapper{8, 8, 8, 9, 9}; + auto category = cudf::test::fixed_width_column_wrapper{8, 9}; + cudf::one_hot_encode(input, category, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, NaNsToNulls) +{ + std::vector input = {1, 2, 3, 4, 5}; + std::vector mask = {true, true, true, true, false, false}; + auto input_column = + cudf::test::fixed_width_column_wrapper(input.begin(), input.end(), mask.begin()); + cudf::nans_to_nulls(input_column, cudf::test::get_default_stream()); +} + 
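+// Note: as of this patch, row_bit_count(t, stream) simply forwards to
+// segmented_row_bit_count(t, 1, stream), so the two tests below cover both
+// stream-taking overloads.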
+TEST_F(TransformTest, RowBitCount)
+{
+  std::vector<std::string> strings{"abc", "ï", "", "z", "bananas", "warp", "", "zing"};
+  cudf::test::strings_column_wrapper col(strings.begin(), strings.end());
+  cudf::row_bit_count(cudf::table_view({col}), cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, SegmentedRowBitCount)
+{
+  // clang-format off
+  std::vector<std::string> const strings { "daïs", "def", "", "z", "bananas", "warp", "", "zing" };
+  std::vector<bool>        const valids  {      1,     0,  0,   1,         0,      1,  1,      1 };
+  // clang-format on
+  cudf::test::strings_column_wrapper const col(strings.begin(), strings.end(), valids.begin());
+  auto const input = cudf::table_view({col});
+  auto constexpr segment_length = 2;
+  cudf::segmented_row_bit_count(input, segment_length, cudf::test::get_default_stream());
+}

From a94512a568bd0351fd20b0c2cbcd6067fd4d504b Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty
Date: Wed, 7 Aug 2024 22:20:57 -0700
Subject: [PATCH 035/270] Add interop example for `arrow::StringViewArray` to
 `cudf::column` (#16498)

Demonstrates the conversion from an `arrow::StringViewArray` to a `cudf::column`

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16498
---
 cpp/examples/build.sh               |   1 +
 cpp/examples/interop/CMakeLists.txt |  20 ++++
 cpp/examples/interop/interop.cpp    | 176 ++++++++++++++++++++++++++++
 3 files changed, 197 insertions(+)
 create mode 100644 cpp/examples/interop/CMakeLists.txt
 create mode 100644 cpp/examples/interop/interop.cpp

diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index dce81fb1677..2d6f6f316c7 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -61,3 +61,4 @@ build_example tpch
 build_example strings
 build_example nested_types
 build_example parquet_io
+build_example interop

diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt
new file mode 100644
index 00000000000..a1f99c1d2fd
--- /dev/null
+++ b/cpp/examples/interop/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.26.4)
+
+include(../set_cuda_architecture.cmake)
+
+rapids_cuda_init_architectures(interop_example)
+rapids_cuda_set_architectures(RAPIDS)
+
+project(
+  interop_example
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+include(../fetch_dependencies.cmake)
+
+add_executable(interop interop.cpp)
+target_link_libraries(interop PRIVATE cudf::cudf)
+target_compile_features(interop PRIVATE cxx_std_17)

diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp
new file mode 100644
index 00000000000..8271c3836e4
--- /dev/null
+++ b/cpp/examples/interop/interop.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/io/csv.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <arrow/api.h>
+
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <string_view>
+
+// Helper functions to create StringViews
+inline arrow::StringViewType::c_type to_inline_string_view(const void* data, int32_t const& size)
+{
+  arrow::StringViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+inline arrow::StringViewType::c_type to_inline_string_view(std::string_view const& v)
+{
+  return to_inline_string_view(v.data(), static_cast<int32_t>(v.size()));
+}
+inline arrow::StringViewType::c_type to_string_view(const void* data,
+                                                    int32_t const& size,
+                                                    int32_t const& buffer_index,
+                                                    int32_t const& offset)
+{
+  if (size <= arrow::StringViewType::kInlineSize) { return to_inline_string_view(data, size); }
+  arrow::StringViewType::c_type out;
+  out.ref = {size, {}, buffer_index, offset};
+  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
+  return out;
+}
+inline arrow::StringViewType::c_type to_string_view(std::string_view const& v,
+                                                    int32_t const& buffer_index,
+                                                    int32_t const& offset)
+{
+  return to_string_view(v.data(), static_cast<int32_t>(v.size()), buffer_index, offset);
+}
+
+/**
+ * @brief Create a StringViewArray
+ *
+ * @param data_buffers The data buffers
+ * @param views The string views
+ * @param validate Whether to validate the array
+ */
+arrow::Result<std::shared_ptr<arrow::Array>> make_string_view_array(
+  arrow::BufferVector const& data_buffers,
+  std::vector<arrow::StringViewType::c_type> const& views,
+  bool validate = true)
+{
+  auto const length = static_cast<int64_t>(views.size());
+  auto const arr    = std::make_shared<arrow::StringViewArray>(
+    arrow::utf8_view(), length, arrow::Buffer::FromVector(views), std::move(data_buffers));
+  if (validate) { RETURN_NOT_OK(arr->ValidateFull()); }
+  return arr;
+}
+
+/**
+ * @brief Convert a vector of strings into a vector of the
+ * constituent chars and a vector of offsets.
+ *
+ * @param strings The vector of strings
+ */
+auto make_chars_and_offsets(std::vector<std::string> const& strings)
+{
+  std::vector<char> chars{};
+  std::vector<cudf::size_type> offsets(1, 0);
+  for (auto& str : strings) {
+    chars.insert(chars.end(), std::cbegin(str), std::cend(str));
+    auto const last_offset = static_cast<std::size_t>(offsets.back());
+    auto const next_offset = last_offset + str.length();
+    CUDF_EXPECTS(
+      next_offset < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+      "Cannot use arrow_string_view_to_cudf_column to build a large strings column");
+    offsets.push_back(static_cast<cudf::size_type>(next_offset));
+  }
+  return std::make_tuple(std::move(chars), std::move(offsets));
+};
+
+/**
+ * @brief Convert an Arrow StringViewArray to a cudf::column
+ *
+ * @param array The Arrow StringViewArray
+ * @param stream The CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> arrow_string_view_to_cudf_column(
+  std::shared_ptr<arrow::StringViewArray> const& array,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  // Convert the string views into chars and offsets
+  std::vector<std::string> strings;
+  for (auto i = 0; i < array->length(); i++) {
+    strings.push_back(array->GetString(i));
+  }
+  auto const [chars, offsets] = make_chars_and_offsets(strings);
+
+  // Copy the chars vector to the device
+  rmm::device_uvector<char> d_chars(chars.size(), stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
+    d_chars.data(), chars.data(), chars.size() * sizeof(char), cudaMemcpyDefault, stream.value()));
+
+  // Copy the offsets vector to the device
+  // and wrap it in a cudf::column
+  rmm::device_uvector<cudf::size_type> d_offsets(offsets.size(), stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(d_offsets.data(),
+                                offsets.data(),
+                                offsets.size() * sizeof(cudf::size_type),
+                                cudaMemcpyDefault,
+                                stream.value()));
+  auto offsets_col =
+    std::make_unique<cudf::column>(std::move(d_offsets), rmm::device_buffer{0, stream, mr}, 0);
+
+  // Create a string column out of the chars and offsets
+  return cudf::make_strings_column(array->length(),
+                                   std::move(offsets_col),
+                                   d_chars.release(),
+                                   0,
+                                   rmm::device_buffer{0, stream, mr});
+}
+
+int main(int argc, char** argv)
+{
+  std::vector<std::shared_ptr<arrow::Buffer>> data_buffers;
+  std::vector<arrow::StringViewType::c_type> views;
+
+  // Define the data buffers and string views
+  auto const buffer_a =
+    arrow::Buffer::FromString("hello rapids teamapache arrow interopnvidiacudf");
+  data_buffers.push_back(buffer_a);
+  views.push_back(to_string_view("hello rapids team", 0, 0));
+  views.push_back(to_string_view("apache arrow interop", 0, 17));
+  views.push_back(to_inline_string_view("nvidia"));
+  views.push_back(to_inline_string_view("cudf"));
+
+  // Create a StringViewArray
+  auto const string_view_col = make_string_view_array(data_buffers, views, true).ValueOrDie();
+  std::cout << string_view_col->ToString() << std::endl;
+
+  // Convert the StringViewArray to a cudf::column
+  auto const cudf_col = arrow_string_view_to_cudf_column(string_view_col);
+
+  // Write the cudf::column as CSV
+  auto const tbl_view = cudf::table_view({cudf_col->view()});
+  std::vector<std::string> const names = {"col_a"};
+
+  std::vector<char> h_buffer;
+  cudf::io::csv_writer_options writer_options =
+    cudf::io::csv_writer_options::builder(cudf::io::sink_info(&h_buffer), tbl_view)
+      .include_header(not names.empty())
+      .names(names);
+
+  cudf::io::write_csv(writer_options);
+  auto const result = std::string(h_buffer.data(), h_buffer.size());
+  std::cout << result << std::endl;
+
+  return 0;
+} From cc75b05b426920e6522c49527f8b684f780f38e3 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 8 Aug 2024 10:00:22 -0400 Subject: [PATCH 036/270] Change IPv4 convert APIs to support UINT32 instead of INT64 (#16489) Changes the integer type for `cudf::strings::ipv4_to_integers` and `cudf::strings::integers_to_ipv4` to use UINT32 types instead of INT64. The INT64 type was originally chosen because libcudf did not support unsigned types at the time. This is a breaking change since the basic input/output type is changed. Closes #16324 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - https://github.com/brandon-b-miller - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16489 --- cpp/include/cudf/strings/convert/convert_ipv4.hpp | 11 +++-------- cpp/src/strings/convert/convert_ipv4.cu | 14 +++++++------- cpp/tests/strings/ipv4_tests.cpp | 8 ++++---- python/cudf/cudf/core/column/numerical.py | 4 ++-- python/cudf/cudf/tests/test_string.py | 6 ++++-- 5 files changed, 20 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp index 04a04907c12..97d1dfee017 100644 --- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp +++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp @@ -44,15 +44,12 @@ namespace strings { * No checking is done on the format. If a string is not in IPv4 format, the resulting * integer is undefined. * - * The resulting 32-bit integer is placed in an int64_t to avoid setting the sign-bit - * in an int32_t type. This could be changed if cudf supported a UINT32 type in the future. - * * Any null entries will result in corresponding null entries in the output column. * * @param input Strings instance for this operation * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New INT64 column converted from strings + * @return New UINT32 column converted from strings */ std::unique_ptr ipv4_to_integers( strings_column_view const& input, @@ -68,13 +65,11 @@ std::unique_ptr ipv4_to_integers( * Each input integer is dissected into four integers by dividing the input into 8-bit sections. * These sub-integers are then converted into [0-9] characters and placed between '.' characters. * - * No checking is done on the input integer value. Only the lower 32-bits are used. - * * Any null entries will result in corresponding null entries in the output column. * - * @throw cudf::logic_error if the input column is not INT64 type. + * @throw cudf::logic_error if the input column is not UINT32 type. 
 *
- * @param integers Integer (INT64) column to convert
+ * @param integers Integer (UINT32) column to convert
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New strings column
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 68a24e000ae..13d6e9bc3ba 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -46,7 +46,7 @@ namespace {
 struct ipv4_to_integers_fn {
   column_device_view const d_strings;
 
-  __device__ int64_t operator()(size_type idx)
+  __device__ uint32_t operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) return 0;
     string_view d_str = d_strings.element<string_view>(idx);
@@ -66,7 +66,7 @@ struct ipv4_to_integers_fn {
       }
     }
     uint32_t result = (ipvals[0] << 24) + (ipvals[1] << 16) + (ipvals[2] << 8) + ipvals[3];
-    return static_cast<int64_t>(result);
+    return result;
   }
 };
 
@@ -79,18 +79,18 @@ std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
 {
   size_type strings_count = input.size();
   if (strings_count == 0) {
-    return make_numeric_column(data_type{type_id::INT64}, 0, mask_state::UNALLOCATED, stream);
+    return make_numeric_column(data_type{type_id::UINT32}, 0, mask_state::UNALLOCATED, stream);
   }
 
   auto strings_column = column_device_view::create(input.parent(), stream);
   // create output column copying the strings' null-mask
-  auto results = make_numeric_column(data_type{type_id::INT64},
+  auto results = make_numeric_column(data_type{type_id::UINT32},
                                      strings_count,
                                      cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                      input.null_count(),
                                      stream,
                                      mr);
-  auto d_results = results->mutable_view().data<int64_t>();
+  auto d_results = results->mutable_view().data<uint32_t>();
   // fill output column with ipv4 integers
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator(0),
@@ -135,7 +135,7 @@ struct integers_to_ipv4_fn {
       return;
     }
 
-    auto const ip_number = d_column.element<int64_t>(idx);
+    auto const ip_number = d_column.element<uint32_t>(idx);
 
     char* out_ptr = d_chars ?
d_chars + d_offsets[idx] : nullptr;
 
     int shift_bits = 24;
@@ -165,7 +165,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
 {
   if (integers.is_empty()) return make_empty_column(type_id::STRING);
 
-  CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
+  CUDF_EXPECTS(integers.type().id() == type_id::UINT32, "Input column must be UINT32 type");
 
   auto d_column = column_device_view::create(integers, stream);
   auto [offsets_column, chars] =
diff --git a/cpp/tests/strings/ipv4_tests.cpp b/cpp/tests/strings/ipv4_tests.cpp
index 3bfe0f9727e..ea3ac439e62 100644
--- a/cpp/tests/strings/ipv4_tests.cpp
+++ b/cpp/tests/strings/ipv4_tests.cpp
@@ -40,8 +40,8 @@ TEST_F(StringsConvertTest, IPv4ToIntegers)
   auto strings_view = cudf::strings_column_view(strings);
   auto results      = cudf::strings::ipv4_to_integers(strings_view);
 
-  std::vector<int64_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
-  cudf::test::fixed_width_column_wrapper<int64_t> expected(
+  std::vector<uint32_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
+  cudf::test::fixed_width_column_wrapper<uint32_t> expected(
     h_expected.cbegin(),
     h_expected.cend(),
     thrust::make_transform_iterator(h_strings.begin(),
@@ -59,8 +59,8 @@ TEST_F(StringsConvertTest, IntegersToIPv4)
     thrust::make_transform_iterator(h_strings.begin(),
                                     [](auto const str) { return str != nullptr; }));
 
-  std::vector<int64_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
-  cudf::test::fixed_width_column_wrapper<int64_t> column(
+  std::vector<uint32_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
+  cudf::test::fixed_width_column_wrapper<uint32_t> column(
     h_column.cbegin(),
     h_column.cend(),
     thrust::make_transform_iterator(h_strings.begin(),
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index df27134d458..b83d7600c82 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -313,8 +313,8 @@ def normalize_binop_value(
         return NotImplemented
 
     def int2ip(self) -> "cudf.core.column.StringColumn":
-        if self.dtype != cudf.dtype("int64"):
-            raise TypeError("Only int64 type can be converted to ip")
+        if self.dtype != cudf.dtype("uint32"):
+            raise TypeError("Only uint32 type can be converted to ip")
 
         return libcudf.string_casting.int2ip(self)
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 4bd084a3938..a2a3e874c91 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -2672,7 +2672,9 @@ def test_string_ip4_to_int():
 
 
 def test_string_int_to_ipv4():
-    gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449])
+    gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype(
+        "uint32"
+    )
     expected = cudf.Series(
         ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"]
     )
@@ -2718,7 +2720,7 @@ def test_string_isipv4():
 
 
 @pytest.mark.parametrize(
-    "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int64", "uint64"}))
+    "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"}))
 )
 def test_string_int_to_ipv4_dtype_fail(dtype):
     gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype)

From da51cad6c25f54ab344b0aa25e3dc1e4adb4550a Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 8 Aug 2024 10:25:11 -0500
Subject: [PATCH 037/270] Improve update-version.sh (#16506)

A few small tweaks to `update-version.sh` for alignment across RAPIDS. The
`UCX_PY` curl call is unused.
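A minimal illustration (not part of this patch) of why the added quoting
matters: an unquoted `${FILE}` is word-split by the shell, so any path that
contains whitespace is passed to `sed_runner` as multiple arguments.

    FILE="python/some pkg/pyproject.toml"   # hypothetical path with a space
    sed_runner 's/old/new/' ${FILE}         # word-splits into two arguments
    sed_runner 's/old/new/' "${FILE}"       # expands to a single argument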
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16506 --- ci/release/update-version.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index ad96aff3930..132e58249e6 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -18,18 +18,16 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} -#Get . for next version +# Get . for next version NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" # Need to distutils-normalize the versions for some use cases CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") -echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -61,7 +59,7 @@ for DEP in "${DEPENDENCIES[@]}"; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done for FILE in python/*/pyproject.toml; do - sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}" done done @@ -77,7 +75,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_ # CI files for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" - sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh From 792dd0686f4970c70f9bdba62c54a3de0a495fd5 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 8 Aug 2024 12:56:36 -0400 Subject: [PATCH 038/270] Update pre-commit hooks (#16510) This PR updates pre-commit hooks to the latest versions that are supported without causing style check errors. 
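(One way to produce such a bump — an assumed workflow, not stated in this PR —
is to run `pre-commit autoupdate`, which rewrites each hook's pinned `rev` to
its latest tag; any hook whose newer release would introduce style check
failures is then held back by hand.)

    pre-commit autoupdate   # updates every `rev` in .pre-commit-config.yaml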
Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16510 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bbcd78d051f..1b17eae0842 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -144,7 +144,7 @@ repos: - id: ruff-format files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.2.0 + rev: v0.3.1 hooks: - id: verify-copyright exclude: | From 1bbe440ee7ddbc021f945e4156220f9bc270a443 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 8 Aug 2024 12:25:29 -0500 Subject: [PATCH 039/270] Add keep option to distinct nvbench (#16497) This PR adopts some work from @srinivasyadav18 with additional modifications. This is meant to complement #16484. Authors: - Bradley Dice (https://github.com/bdice) - Srinivas Yadav (https://github.com/srinivasyadav18) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Srinivas Yadav (https://github.com/srinivasyadav18) URL: https://github.com/rapidsai/cudf/pull/16497 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/stream_compaction/distinct.cpp | 45 ++++++++++++------- .../stream_compaction/stable_distinct.cpp | 45 ++++++++++++------- .../stream_compaction_common.cpp | 35 +++++++++++++++ .../stream_compaction_common.hpp | 19 ++++++++ 5 files changed, 113 insertions(+), 32 deletions(-) create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.cpp create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7be456ddfba..483b7b0a539 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -162,6 +162,7 @@ ConfigureNVBench( stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp stream_compaction/stable_distinct.cpp + stream_compaction/stream_compaction_common.cpp stream_compaction/unique.cpp stream_compaction/unique_count.cpp ) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index c04b6516903..d7deebca89a 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,15 +24,29 @@ #include +#include + NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template void nvbench_distinct(nvbench::state& state, nvbench::type_list) { - cudf::size_type const num_rows = state.get_int64("NumRows"); + cudf::size_type const num_rows = state.get_int64("NumRows"); + auto const keep = get_keep(state.get_string("keep")); + cudf::size_type const cardinality = state.get_int64("cardinality"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = data_profile_builder() + .cardinality(cardinality) + .null_probability(0.01) + .distribution(cudf::type_to_id(), + distribution_id::UNIFORM, + static_cast(0), + std::numeric_limits::max()); auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); @@ -40,20 +55,19 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = cudf::distinct( + input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } -using data_type = nvbench::type_list; +using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) - .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + .add_string_axis("keep", {"any", "first", "last", "none"}) + .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) + .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); template void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) @@ -61,6 +75,7 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) auto const size = state.get_int64("ColumnSize"); auto const dtype = cudf::type_to_id(); double const null_probability = state.get_float64("null_probability"); + auto const keep = get_keep(state.get_string("keep")); auto builder = data_profile_builder().null_probability(null_probability); if (dtype == cudf::type_id::LIST) { @@ -80,11 +95,8 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::distinct(*table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = + cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } @@ -92,5 +104,6 @@ NVBENCH_BENCH_TYPES(nvbench_distinct_list, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("distinct_list") .set_type_axes_names({"Type"}) + .add_string_axis("keep", {"any", "first", "last", "none"}) .add_float64_axis("null_probability", {0.0, 0.1}) .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp index bcee3048013..0a8836c0583 
100644 --- a/cpp/benchmarks/stream_compaction/stable_distinct.cpp +++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,15 +24,29 @@ #include +#include + NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) { - cudf::size_type const num_rows = state.get_int64("NumRows"); + cudf::size_type const num_rows = state.get_int64("NumRows"); + auto const keep = get_keep(state.get_string("keep")); + cudf::size_type const cardinality = state.get_int64("cardinality"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = data_profile_builder() + .cardinality(cardinality) + .null_probability(0.01) + .distribution(cudf::type_to_id(), + distribution_id::UNIFORM, + static_cast(0), + std::numeric_limits::max()); auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); @@ -40,20 +55,19 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::stable_distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = cudf::stable_distinct( + input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } -using data_type = nvbench::type_list; +using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("stable_distinct") .set_type_axes_names({"Type"}) - .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + .add_string_axis("keep", {"any", "first", "last", "none"}) + .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) + .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); template void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list) @@ -61,6 +75,7 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list(); double const null_probability = state.get_float64("null_probability"); + auto const keep = get_keep(state.get_string("keep")); auto builder = data_profile_builder().null_probability(null_probability); if (dtype == cudf::type_id::LIST) { @@ -80,11 +95,8 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list)) .set_name("stable_distinct_list") .set_type_axes_names({"Type"}) + .add_string_axis("keep", {"any", "first", "last", "none"}) .add_float64_axis("null_probability", {0.0, 0.1}) .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp new file mode 100644 index 00000000000..8cbb2956777 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020-2024, 
NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +cudf::duplicate_keep_option get_keep(std::string const& keep_str) +{ + if (keep_str == "any") { + return cudf::duplicate_keep_option::KEEP_ANY; + } else if (keep_str == "first") { + return cudf::duplicate_keep_option::KEEP_FIRST; + } else if (keep_str == "last") { + return cudf::duplicate_keep_option::KEEP_LAST; + } else if (keep_str == "none") { + return cudf::duplicate_keep_option::KEEP_NONE; + } else { + CUDF_FAIL("Unsupported keep option."); + } +} diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp new file mode 100644 index 00000000000..d1ef2b10f41 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +cudf::duplicate_keep_option get_keep(std::string const& keep_str); From 2c8de625b69bf5f7f3315c45a34bdf9ba45315a9 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 9 Aug 2024 08:25:58 -0500 Subject: [PATCH 040/270] Enable list to be forced as string in JSON reader (#16472) closes #15278 This PR allows a list type to also be forced as a string in the JSON reader when mixed-types-as-string is enabled and the user-given schema specifies the column as a string.
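For illustration, a minimal sketch of the new behavior (it mirrors the test changes below; the input literal and column name are made up for this example):

```cpp
#include <cudf/io/json.hpp>

#include <map>
#include <string>

// JSONL input where column "a" holds a list; the user-given schema forces
// "a" to STRING, which now works for lists as well as structs.
std::string const json_string = R"({ "a": [1, 2, 3] })";
std::map<std::string, cudf::io::schema_element> dtype_schema{
  {"a", {cudf::data_type{cudf::type_id::STRING}}}};
auto options =
  cudf::io::json_reader_options::builder(
    cudf::io::source_info{json_string.data(), json_string.size()})
    .dtypes(dtype_schema)
    .mixed_types_as_string(true)  // required for the forced conversion
    .lines(true)
    .build();
auto result = cudf::io::read_json(options);  // column "a" holds the raw text "[1, 2, 3]"
```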
Authors: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16472 --- cpp/src/io/json/json_column.cu | 22 ++++--- cpp/tests/io/json/json_test.cpp | 113 ++++++++++++++++++++++---------- 2 files changed, 90 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 17fa7abdffe..e5e21e054a6 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -567,22 +567,22 @@ void make_device_json_column(device_span input, thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); }; - auto initialize_json_columns = [&](auto i, auto& col) { - if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) { + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { return; - } else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) { + } else if (column_category == NC_VAL || column_category == NC_STR) { col.string_offsets.resize(max_row_offsets[i] + 1, stream); col.string_lengths.resize(max_row_offsets[i] + 1, stream); init_to_zero(col.string_offsets); init_to_zero(col.string_lengths); - } else if (column_categories[i] == NC_LIST) { + } else if (column_category == NC_LIST) { col.child_offsets.resize(max_row_offsets[i] + 2, stream); init_to_zero(col.child_offsets); } col.num_rows = max_row_offsets[i] + 1; col.validity = cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_categories[i]); + col.type = to_json_col_type(column_category); }; auto reinitialize_as_string = [&](auto i, auto& col) { @@ -764,21 +764,23 @@ void make_device_json_column(device_span input, } } + auto this_column_category = column_categories[this_col_id]; if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct forced as string, and enforce it + // get path of this column, check if it is a struct/list forced as string, and enforce it auto const nt = tree_path.get_path(this_col_id); std::optional const user_dtype = get_path_data_type(nt, options); - if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and - user_dtype.value().id() == type_id::STRING) { + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { is_mixed_type_column[this_col_id] = 1; - column_categories[this_col_id] = NC_STR; + this_column_category = NC_STR; } } CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); // move into parent device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col); + initialize_json_columns(this_col_id, col, this_column_category); auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); if (not replaced) parent_col.column_order.push_back(name); diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 993ab82f423..0a485e26b71 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2351,7 +2351,7 @@ TEST_F(JsonReaderTest, MapTypes) // Testing function for mixed types in JSON 
(for spark json reader) auto test_fn = [](std::string_view json_string, bool lines, std::vector types) { std::map dtype_schema{ - {"foo1", {data_type{type_id::STRING}}}, // list won't be a string + {"foo1", {data_type{type_id::STRING}}}, // list forced as a string {"foo2", {data_type{type_id::STRING}}}, // struct forced as a string {"1", {data_type{type_id::STRING}}}, {"2", {data_type{type_id::STRING}}}, @@ -2378,17 +2378,17 @@ TEST_F(JsonReaderTest, MapTypes) test_fn(R"([{ "foo1": [1,2,3], "bar": 123 }, { "foo2": { "a": 1 }, "bar": 456 }])", false, - {type_id::LIST, type_id::INT32, type_id::STRING}); + {type_id::STRING, type_id::INT32, type_id::STRING}); // jsonl test_fn(R"( { "foo1": [1,2,3], "bar": 123 } { "foo2": { "a": 1 }, "bar": 456 })", true, - {type_id::LIST, type_id::INT32, type_id::STRING}); + {type_id::STRING, type_id::INT32, type_id::STRING}); // jsonl-array test_fn(R"([123, [1,2,3]] [456, null, { "a": 1 }])", true, - {type_id::INT64, type_id::LIST, type_id::STRING}); + {type_id::INT64, type_id::STRING, type_id::STRING}); // json-array test_fn(R"([[[1,2,3], null, 123], [null, { "a": 1 }, 456 ]])", @@ -2678,38 +2678,81 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter) TEST_F(JsonReaderTest, JSONMixedTypeChildren) { - std::string const json_str = R"( -{ "Root": { "Key": [ { "EE": "A" } ] } } -{ "Root": { "Key": { } } } -{ "Root": { "Key": [{ "YY": 1}] } } -)"; - // Column "EE" is created and destroyed - // Column "YY" should not be created - - cudf::io::json_reader_options options = - cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()}) - .lines(true) - .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) - .normalize_single_quotes(true) - .normalize_whitespace(false) - .mixed_types_as_string(true) - .keep_quotes(true); - - auto result = cudf::io::read_json(options); + // struct mixed. 
+ { + std::string const json_str = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": { } } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected( + {R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + } - ASSERT_EQ(result.tbl->num_columns(), 1); - ASSERT_EQ(result.metadata.schema_info.size(), 1); - EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); - ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); - EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); - EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); - // types - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); - EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); - cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + // list mixed. 
+ { + std::string const json_str = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected( + {R"([ { "EE": "A" } ])", "\"abc\"", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + } } CUDF_TEST_PROGRAM_MAIN() From 9ec34ad81152a4d7889bdf1f5b92032000b09b8f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:24:31 -0400 Subject: [PATCH 041/270] Remove a deprecated multibyte_split API (#16501) Removes overloaded `cudf::io::text::multibyte_split` API deprecated in 24.08 and is no longer needed. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16501 --- cpp/include/cudf/io/text/multibyte_split.hpp | 20 -------------------- cpp/src/io/text/multibyte_split.cu | 14 -------------- 2 files changed, 34 deletions(-) diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 8624a386d0f..3a1f9611324 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -96,26 +96,6 @@ std::unique_ptr multibyte_split( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -/** - * @brief Splits the source text into a strings column using a multiple byte delimiter. - * - * @deprecated Since 24.08 - * - * @param source The source input data encoded in UTF-8 - * @param delimiter UTF-8 encoded string for which to find offsets in the source - * @param byte_range The position and size within `source` to produce the column from - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Memory resource to use for the device memory allocation - * @return The strings found by splitting the source by the delimiter within the relevant byte - * range. 
- */ -[[deprecated]] std::unique_ptr multibyte_split( - data_chunk_source const& source, - std::string const& delimiter, - std::optional byte_range, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of group } // namespace text diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index be2e2b9a79c..97729a091fb 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -567,20 +567,6 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source } // namespace detail -// deprecated in 24.08 -std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, - std::string const& delimiter, - std::optional byte_range, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - return multibyte_split(source, - delimiter, - parse_options{byte_range.value_or(create_byte_range_info_max())}, - stream, - mr); -} - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, parse_options options, From 8009dc800bf79ba5fbacc9658235a212590640ba Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Fri, 9 Aug 2024 09:07:47 -0700 Subject: [PATCH 042/270] Update docs of the TPC-H derived examples (#16423) Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16423 --- .gitignore | 2 + cpp/examples/tpch/README.md | 37 ++++++------ .../tpch/datagen/correct_datatypes.py | 60 +++++++++++++++++++ cpp/examples/tpch/datagen/datagen.sh | 31 ++++++++++ cpp/examples/tpch/datagen/tpch.patch | 33 ++++++++++ 5 files changed, 145 insertions(+), 18 deletions(-) create mode 100644 cpp/examples/tpch/datagen/correct_datatypes.py create mode 100755 cpp/examples/tpch/datagen/datagen.sh create mode 100644 cpp/examples/tpch/datagen/tpch.patch diff --git a/.gitignore b/.gitignore index c89fb49697a..153c7f59744 100644 --- a/.gitignore +++ b/.gitignore @@ -79,6 +79,8 @@ Debug build/ cpp/build/ cpp/examples/*/install/ +cpp/examples/*/build/ +cpp/examples/tpch/datagen/datafusion cpp/include/cudf/ipc_generated/*.h cpp/thirdparty/googletest/ diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md index 1ea71ae9824..8c046c3f1e8 100644 --- a/cpp/examples/tpch/README.md +++ b/cpp/examples/tpch/README.md @@ -1,38 +1,39 @@ -# TPC-H Inspired Examples +# TPC-H Derived Examples Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format. ## Requirements - Rust +- [libcudf](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment) -## Generating the Dataset +## Running Queries -1. Clone the datafusion repository. +1. Build the `libcudf` examples. ```bash -git clone git@github.com:apache/datafusion.git +cd cudf/cpp/examples +./build.sh ``` +The TPC-H query binaries would be built inside `tpch/build`. -2. Run the data generator. The data will be placed in a `data/` subdirectory. +2. Generate the dataset. 
```bash -cd datafusion/benchmarks/ -./bench.sh data tpch - -# for scale factor 10, -./bench.sh data tpch10 +cd tpch/datagen +./datagen.sh [scale factor (1/10)] ``` -## Running Queries +The parquet files will be generated in `tpch/datagen/datafusion/benchmarks/data/tpch_sf[scale factor]`. -1. Build the examples. +3. Set these environment variables for optimized runtimes. ```bash -cd cpp/examples -./build.sh +export KVIKIO_COMPAT_MODE="on" +export LIBCUDF_CUFILE_POLICY="KVIKIO" +export CUDA_MODULE_LOADING="EAGER" ``` -The TPC-H query binaries would be built inside `examples/tpch/build`. -2. Execute the queries. +4. Execute the queries. ```bash -./tpch/build/tpch_q1 +./tpch/build/tpch_q[query no] [path to dataset] [memory resource type (cuda/pool/managed/managed_pool)] ``` -A parquet file named `q1.parquet` would be generated holding the results of the query. + +A parquet file named `q[query no].parquet` would be generated containing the results of the query. diff --git a/cpp/examples/tpch/datagen/correct_datatypes.py b/cpp/examples/tpch/datagen/correct_datatypes.py new file mode 100644 index 00000000000..8564774647b --- /dev/null +++ b/cpp/examples/tpch/datagen/correct_datatypes.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import os +import sys + +import pyarrow as pa +import pyarrow.parquet as pq +import pandas as pd + +if __name__ == "__main__": + dataset_path = str(sys.argv[1]) + tables = ["lineitem", "part", "partsupp", "orders", "supplier", "customer", "nation", "region"] + for table in tables: + filepath = os.path.join(dataset_path, f"{table}.parquet") + print("Reading file ", filepath) + + if filepath.endswith("lineitem.parquet"): + df = pd.read_parquet(filepath) + df["l_linenumber"] = df["l_linenumber"].astype("int64") + df["l_quantity"] = df["l_quantity"].astype("int64") + df["l_extendedprice"] = df["l_extendedprice"].astype("float64") + df["l_discount"] = df["l_discount"].astype("float64") + df["l_tax"] = df["l_tax"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("part.parquet"): + df = pd.read_parquet(filepath) + df["p_size"] = df["p_size"].astype("int64") + df["p_retailprice"] = df["p_retailprice"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("partsupp.parquet"): + df = pd.read_parquet(filepath) + df["ps_availqty"] = df["ps_availqty"].astype("int64") + df["ps_supplycost"] = df["ps_supplycost"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("orders.parquet"): + df = pd.read_parquet(filepath) + df["o_totalprice"] = df["o_totalprice"].astype("float64") + df["o_shippriority"] = df["o_shippriority"].astype("int64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("supplier.parquet"): + df = pd.read_parquet(filepath) + df["s_acctbal"] = df["s_acctbal"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("customer.parquet"): + df = pd.read_parquet(filepath) + df["c_acctbal"] = df["c_acctbal"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("nation.parquet"): + df = pd.read_parquet(filepath) + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("region.parquet"): + df = pd.read_parquet(filepath) + 
pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") diff --git a/cpp/examples/tpch/datagen/datagen.sh b/cpp/examples/tpch/datagen/datagen.sh new file mode 100755 index 00000000000..0b03753daea --- /dev/null +++ b/cpp/examples/tpch/datagen/datagen.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -e + +scale_factor=$1 +script_dir=$(pwd) + +# Clone the datafusion repository and apply a patch +# for single threaded data generation so that a +# single parquet file is generated for each table +rm -rf datafusion +git clone https://github.com/apache/datafusion.git datafusion +cd datafusion/ +git checkout 679a85f +git apply ${script_dir}/tpch.patch +cd benchmarks/ + +# Generate the data +# Currently, we support only scale factor 1 and 10 +if [ ${scale_factor} -eq 1 ]; then + ./bench.sh data tpch +elif [ ${scale_factor} -eq 10 ]; then + ./bench.sh data tpch10 +else + echo "Unsupported scale factor" + exit 1 +fi + +# Correct the datatypes of the parquet files +python3 ${script_dir}/correct_datatypes.py data/tpch_sf${scale_factor} diff --git a/cpp/examples/tpch/datagen/tpch.patch b/cpp/examples/tpch/datagen/tpch.patch new file mode 100644 index 00000000000..42727aa9904 --- /dev/null +++ b/cpp/examples/tpch/datagen/tpch.patch @@ -0,0 +1,33 @@ +diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh +index 3b854f6dc..f000f09c0 100755 +--- a/benchmarks/bench.sh ++++ b/benchmarks/bench.sh +@@ -311,6 +311,15 @@ data_tpch() { + $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet + popd > /dev/null + fi ++ ++ cp ${TPCH_DIR}/lineitem/part-0.parquet ${TPCH_DIR}/lineitem.parquet ++ cp ${TPCH_DIR}/orders/part-0.parquet ${TPCH_DIR}/orders.parquet ++ cp ${TPCH_DIR}/part/part-0.parquet ${TPCH_DIR}/part.parquet ++ cp ${TPCH_DIR}/partsupp/part-0.parquet ${TPCH_DIR}/partsupp.parquet ++ cp ${TPCH_DIR}/customer/part-0.parquet ${TPCH_DIR}/customer.parquet ++ cp ${TPCH_DIR}/supplier/part-0.parquet ${TPCH_DIR}/supplier.parquet ++ cp ${TPCH_DIR}/nation/part-0.parquet ${TPCH_DIR}/nation.parquet ++ cp ${TPCH_DIR}/region/part-0.parquet ${TPCH_DIR}/region.parquet + } + + # Runs the tpch benchmark +diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs +index b5204b343..84fd2e78d 100644 +--- a/datafusion/common/src/config.rs ++++ b/datafusion/common/src/config.rs +@@ -250,7 +250,7 @@ config_namespace! { + /// concurrency. + /// + /// Defaults to the number of CPU cores on the system +- pub target_partitions: usize, default = num_cpus::get() ++ pub target_partitions: usize, default = 1 + + /// The default time zone + /// From 4446cf0188c03b82cbec28493aa131027f25dffa Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:43:23 -0500 Subject: [PATCH 043/270] Update json normalization to take device_buffer (#16520) This change updates json normalization calls (quote and whitespace normalization) to take owning buffer of device_buffer as input rather than device_uvector. It makes it easy to hand over a string_column's char buffer to normalization calls. 
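For illustration, a minimal sketch of the new calling pattern (it mirrors the updated tests below; `buf`, `stream`, and `mr` stand in for an existing device buffer, stream, and memory resource):

```cpp
#include <cudf/io/datasource.hpp>
#include <cudf/io/detail/json.hpp>

#include <rmm/device_buffer.hpp>

// Move an existing rmm::device_buffer (e.g. a string column's char data)
// into an owning buffer and normalize it in place -- no device_uvector
// staging copy is needed anymore.
cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(buf));
cudf::io::json::detail::normalize_single_quotes(device_data, stream, mr);
```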
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16520 --- cpp/include/cudf/io/detail/json.hpp | 4 ++-- cpp/src/io/json/json_normalization.cu | 20 +++++++++---------- cpp/src/io/json/read_json.cu | 16 +++++++-------- .../io/json/json_quote_normalization_test.cpp | 9 ++++----- .../json_whitespace_normalization_test.cu | 7 +++---- 5 files changed, 27 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 42b10a78ce8..38ba4f675c3 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -61,7 +61,7 @@ void write_json(data_sink* sink, * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -void normalize_single_quotes(datasource::owning_buffer>& indata, +void normalize_single_quotes(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -72,7 +72,7 @@ void normalize_single_quotes(datasource::owning_buffer * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -void normalize_whitespace(datasource::owning_buffer>& indata, +void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace io::json::detail diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 760b2214365..cb8b4e97ebb 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -298,7 +298,7 @@ struct TransduceToNormalizedWS { namespace detail { -void normalize_single_quotes(datasource::owning_buffer>& indata, +void normalize_single_quotes(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -311,22 +311,22 @@ void normalize_single_quotes(datasource::owning_buffer outbuf(indata.size() * 2, stream, mr); + rmm::device_buffer outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(indata.data(), + parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), - outbuf.data(), + static_cast(outbuf.data()), thrust::make_discard_iterator(), outbuf_size.data(), normalize_quotes::start_state, stream); outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer> outdata(std::move(outbuf)); + datasource::owning_buffer outdata(std::move(outbuf)); std::swap(indata, outdata); } -void normalize_whitespace(datasource::owning_buffer>& indata, +void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -339,18 +339,18 @@ void normalize_whitespace(datasource::owning_buffer normalize_whitespace::TransduceToNormalizedWS{}), stream); - rmm::device_uvector outbuf(indata.size(), stream, mr); + rmm::device_buffer outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(indata.data(), + parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), - outbuf.data(), + static_cast(outbuf.data()), thrust::make_discard_iterator(), outbuf_size.data(), normalize_whitespace::start_state, stream); outbuf.resize(outbuf_size.value(stream), stream); - 
datasource::owning_buffer> outdata(std::move(outbuf)); + datasource::owning_buffer outdata(std::move(outbuf)); std::swap(indata, outdata); } diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 590f70864b1..e0d0497e0a2 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -168,7 +168,7 @@ size_t estimate_size_per_subchunk(size_t chunk_size) * @param stream CUDA stream used for device memory operations and kernel launches * @returns Data source owning buffer enclosing the bytes read */ -datasource::owning_buffer> get_record_range_raw_input( +datasource::owning_buffer get_record_range_raw_input( host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream) @@ -200,8 +200,8 @@ datasource::owning_buffer> get_record_range_raw_input( ? total_source_size * estimated_compression_ratio + header_size : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + num_extra_delimiters; - rmm::device_uvector buffer(buffer_size, stream); - device_span bufspan(buffer); + rmm::device_buffer buffer(buffer_size, stream); + device_span bufspan(reinterpret_cast(buffer.data()), buffer.size()); // Offset within buffer indicating first read position std::int64_t buffer_offset = 0; @@ -213,8 +213,8 @@ datasource::owning_buffer> get_record_range_raw_input( chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream); if (first_delim_pos == -1) { // return empty owning datasource buffer - auto empty_buf = rmm::device_uvector(0, stream); - return datasource::owning_buffer>(std::move(empty_buf)); + auto empty_buf = rmm::device_buffer(0, stream); + return datasource::owning_buffer(std::move(empty_buf)); } else if (!should_load_all_sources) { // Find next delimiter std::int64_t next_delim_pos = -1; @@ -232,12 +232,12 @@ datasource::owning_buffer> get_record_range_raw_input( } if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size(); - return datasource::owning_buffer>( + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, next_delim_pos - first_delim_pos - shift_for_nonzero_offset); } - return datasource::owning_buffer>( + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); @@ -249,7 +249,7 @@ table_with_metadata read_batch(host_span> sources, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - datasource::owning_buffer> bufview = + datasource::owning_buffer bufview = get_record_range_raw_input(sources, reader_opts, stream); // If input JSON buffer has single quotes and option to normalize single quotes is enabled, diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index 55ad0afe499..3a9ba8d9f3b 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include @@ -42,12 +42,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou std::make_shared(); auto stream_view = cudf::test::get_default_stream(); - auto device_input = cudf::detail::make_device_uvector_async( - host_input, stream_view, rmm::mr::get_current_device_resource()); + auto device_input = rmm::device_buffer( + host_input.c_str(), host_input.size(), 
stream_view, rmm::mr::get_current_device_resource()); // Preprocessing FST - cudf::io::datasource::owning_buffer> device_data( - std::move(device_input)); + cudf::io::datasource::owning_buffer device_data(std::move(device_input)); cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 8ed5fa81b12..01dd17fab98 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -38,12 +38,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Prepare cuda stream for data transfers & kernels auto stream_view = cudf::test::get_default_stream(); - auto device_input = cudf::detail::make_device_uvector_async( - host_input, stream_view, rmm::mr::get_current_device_resource()); + auto device_input = rmm::device_buffer( + host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource()); // Preprocessing FST - cudf::io::datasource::owning_buffer> device_data( - std::move(device_input)); + cudf::io::datasource::owning_buffer device_data(std::move(device_input)); cudf::io::json::detail::normalize_whitespace( device_data, stream_view, rmm::mr::get_current_device_resource()); From 16aa0eaa54d00d88f897766d91f9e531f64b0070 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Aug 2024 09:33:19 -1000 Subject: [PATCH 044/270] Allow DataFrame.sort_values(by=) to select an index level (#16519) closes #14794 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16519 --- python/cudf/cudf/core/index.py | 13 ++++++++++++- python/cudf/cudf/core/indexed_frame.py | 26 +++++++++++++++++++++++++- python/cudf/cudf/tests/test_sorting.py | 20 ++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 094da09ab08..7f40428c1b8 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,6 +5,7 @@ import operator import pickle import warnings +from collections.abc import Hashable from functools import cache, cached_property from numbers import Number from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast @@ -60,7 +61,7 @@ from cudf.utils.utils import _warn_no_dask_cudf, search_range if TYPE_CHECKING: - from collections.abc import Generator, Hashable, Iterable + from collections.abc import Generator, Iterable from datetime import tzinfo @@ -450,6 +451,16 @@ def __getitem__(self, index): return self.start + index * self.step return self._as_int_index()[index] + def _get_columns_by_label(self, labels) -> Index: + # used in .sort_values + if isinstance(labels, Hashable): + if labels == self.name: + return self._as_int_index() + elif is_list_like(labels): + if list(self.names) == list(labels): + return self._as_int_index() + raise KeyError(labels) + @_performance_tracking def equals(self, other) -> bool: if isinstance(other, RangeIndex): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 24d947a574a..3b44a0f5864 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3592,10 +3592,34 @@ def sort_values( 
if len(self) == 0: return self + try: + by_in_columns = self._get_columns_by_label(by) + except KeyError: + by_in_columns = None + if self.ndim == 1: + # For Series case, we're never selecting an index level. + by_in_index = None + else: + try: + by_in_index = self.index._get_columns_by_label(by) + except KeyError: + by_in_index = None + + if by_in_columns is not None and by_in_index is not None: + raise ValueError( + f"{by=} appears in the {type(self).__name__} columns " + "and as an index level which is ambiguous." + ) + elif by_in_columns is not None: + by_columns = by_in_columns + elif by_in_index is not None: + by_columns = by_in_index + else: + raise KeyError(by) # argsort the `by` column out = self._gather( GatherMap.from_column_unchecked( - self._get_columns_by_label(by)._get_sorted_inds( + by_columns._get_sorted_inds( ascending=ascending, na_position=na_position ), len(self), diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index a8ffce6e88b..2cf2259d9ec 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -405,3 +405,23 @@ def test_dataframe_scatter_by_map_empty(): df = DataFrame({"a": [], "b": []}, dtype="float64") scattered = df.scatter_by_map(df["a"]) assert len(scattered) == 0 + + +def test_sort_values_by_index_level(): + df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="b")) + cudf_df = DataFrame.from_pandas(df) + result = cudf_df.sort_values("b") + expected = df.sort_values("b") + assert_eq(result, expected) + + +def test_sort_values_by_ambiguous(): + df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="a")) + cudf_df = DataFrame.from_pandas(df) + + assert_exceptions_equal( + lfunc=df.sort_values, + rfunc=cudf_df.sort_values, + lfunc_args_and_kwargs=(["a"], {}), + rfunc_args_and_kwargs=(["a"], {}), + ) From 4cd87d3fdb0de6154504f8486ed49b685a9dceec Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Aug 2024 09:33:53 -1000 Subject: [PATCH 045/270] Fix `date_range(start, end, freq)` when end-start is divisible by freq (#16516) xref https://github.com/rapidsai/cudf/issues/16507 `date_range` generates its dates via `range`, and the end of this range was calculated via `math.ceil((end - start) / freq)`. If `(end - start) / freq` did not produce a remainder, `math.ceil` would not correctly increment this value by `1` to capture the last date. 
Instead, this PR uses `math.floor((end - start) / freq) + 1` to always ensure the last date is captured Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16516 --- python/cudf/cudf/core/index.py | 6 ++++-- python/cudf/cudf/core/series.py | 3 +++ python/cudf/cudf/core/tools/datetimes.py | 9 +++++---- python/cudf/cudf/tests/test_datetime.py | 6 ++++++ 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7f40428c1b8..3eab27bd165 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2414,11 +2414,13 @@ def day_name(self, locale: str | None = None) -> Index: >>> datetime_index = cudf.date_range("2016-12-31", "2017-01-08", freq="D") >>> datetime_index DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'], + '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', + '2017-01-08'], dtype='datetime64[ns]', freq='D') >>> datetime_index.day_name() Index(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday'], dtype='object') + 'Friday', 'Saturday', 'Sunday'], + dtype='object') """ day_names = self._column.get_day_names(locale) return Index._from_data({self.name: day_names}) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index de57ac5f290..53675d339ac 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -801,14 +801,17 @@ def dt(self): >>> s.dt.hour 0 12 1 13 + 2 14 dtype: int16 >>> s.dt.second 0 0 1 0 + 2 0 dtype: int16 >>> s.dt.day 0 3 1 3 + 2 3 dtype: int16 Returns diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 2f77778116f..c50a36b68b5 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -951,7 +951,7 @@ def date_range( end = cudf.Scalar(end, dtype=dtype) _is_increment_sequence = end >= start - periods = math.ceil( + periods = math.floor( int(end - start) / _offset_to_nanoseconds_lower_bound(offset) ) @@ -959,9 +959,10 @@ def date_range( # Mismatched sign between (end-start) and offset, return empty # column periods = 0 - elif periods == 0: - # end == start, return exactly 1 timestamp (start) - periods = 1 + else: + # If end == start, periods == 0 and we return exactly 1 timestamp (start). + # Otherwise, since closed="both", we ensure the end point is included. + periods += 1 # We compute `end_estim` (the estimated upper bound of the date # range) below, but don't always use it. 
We do this to ensure diff --git a/python/cudf/cudf/tests/test_datetime.py index 6bc775d2a2c..7be4faa42c3 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2536,3 +2536,9 @@ def test_dti_methods(method, kwargs): result = getattr(cudf_dti, method)(**kwargs) expected = getattr(pd_dti, method)(**kwargs) assert_eq(result, expected) + + +def test_date_range_start_end_divisible_by_freq(): + result = cudf.date_range("2011-01-01", "2011-01-02", freq="h") + expected = pd.date_range("2011-01-01", "2011-01-02", freq="h") + assert_eq(result, expected) From 45b20d135a290d5f14e291316e94674653f71737 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:22:15 -1000 Subject: [PATCH 046/270] Preserve array name in MultiIndex.from_arrays (#16515) xref https://github.com/rapidsai/cudf/issues/16507 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16515 --- python/cudf/cudf/core/multiindex.py | 4 ++++ python/cudf/cudf/tests/test_multiindex.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py index 9646b34830f..ab88b191570 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1394,12 +1394,16 @@ def from_arrays( raise TypeError(error_msg) codes = [] levels = [] + names_from_arrays = [] for array in arrays: if not (is_list_like(array) or is_column_like(array)): raise TypeError(error_msg) code, level = factorize(array, sort=True) codes.append(code) levels.append(level) + names_from_arrays.append(getattr(array, "name", None)) + if names is None: + names = names_from_arrays return cls( codes=codes, levels=levels, sortorder=sortorder, names=names ) diff --git a/python/cudf/cudf/tests/test_multiindex.py index b7314a36e73..a68f4574da3 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2179,3 +2179,13 @@ def test_unique_level(): result = pd_mi.unique(level=1) expected = cudf_mi.unique(level=1) assert_eq(result, expected) + + +@pytest.mark.parametrize( + "idx", [pd.Index, pd.CategoricalIndex, pd.DatetimeIndex, pd.TimedeltaIndex] +) +def test_from_arrays_infer_names(idx): + arrays = [idx([1], name="foo"), idx([2], name="bar")] + expected = pd.MultiIndex.from_arrays(arrays) + result = cudf.MultiIndex.from_arrays(arrays) + assert_eq(result, expected) From a3dc14fcea938729c7c9468bd6a64331395b2f78 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 12 Aug 2024 07:56:48 -1000 Subject: [PATCH 047/270] Disallow indexing by selecting duplicate labels (#16514) xref https://github.com/rapidsai/cudf/issues/16507 I would say this was a bug before because we would silently return a new DataFrame with just `len(set(column_labels))` columns when selecting by column. Now this operation raises since duplicate column labels are generally not supported.
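A small sketch of the new behavior (this matches the test added below):

```python
import cudf

df = cudf.DataFrame({"a": [1], "b": [2]})
df[["a", "b"]]  # fine: unique labels
df[["a", "a"]]  # previously returned a single-column frame; now raises ValueError
```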
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16514 --- python/cudf/cudf/core/column_accessor.py | 4 ++++ python/cudf/cudf/tests/test_indexing.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 819d351b2c4..83596704672 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -530,6 +530,10 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: ) else: data = {k: self._grouped_data[k] for k in key} + if len(data) != len(key): + raise ValueError( + "Selecting duplicate column labels is not supported." + ) if self.multiindex: data = dict(_to_flat_dict_inner(data)) return self.__class__( diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 7005cbc6834..716b4dc6acd 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -2361,3 +2361,11 @@ def test_sliced_categorical_as_ordered(): name="a", ) assert_eq(result, expected) + + +def test_duplicate_labels_raises(): + df = cudf.DataFrame([[1, 2]], columns=["a", "b"]) + with pytest.raises(ValueError): + df[["a", "a"]] + with pytest.raises(ValueError): + df.loc[:, ["a", "a"]] From 091cb72294a394deb176600e74c7cb115cfff05a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:48:02 -0400 Subject: [PATCH 048/270] Remove deprecated public APIs from libcudf (#16524) Removing some more deprecated public libcudf APIs. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16524 --- cpp/include/cudf/strings/replace.hpp | 12 ------------ cpp/include/cudf/utilities/type_checks.hpp | 19 ------------------- cpp/src/strings/replace/multi.cu | 11 ----------- cpp/src/utilities/type_checks.cpp | 5 ----- 4 files changed, 47 deletions(-) diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 5b4ffb98f99..f450b77ad7a 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -160,18 +160,6 @@ std::unique_ptr replace_multiple( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -/** - * @copydoc cudf::strings::replace_multiple - * - * @deprecated since 24.08 - */ -[[deprecated]] std::unique_ptr replace( - strings_column_view const& input, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of doxygen group } // namespace strings } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp index 4fcbca09d17..aeb5db57830 100644 --- a/cpp/include/cudf/utilities/type_checks.hpp +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -22,25 +22,6 @@ namespace CUDF_EXPORT cudf { -/** - * @brief Compare the types of two `column_view`s - * - * @deprecated Since 24.06. Use cudf::have_same_types instead. - * - * This function returns true if the type of `lhs` equals that of `rhs`. 
- * - For fixed point types, the scale is compared. - * - For dictionary types, the type of the keys are compared if both are - * non-empty columns. - * - For lists types, the type of child columns are compared recursively. - * - For struct types, the type of each field are compared in order. - * - For all other types, the `id` of `data_type` is compared. - * - * @param lhs The first `column_view` to compare - * @param rhs The second `column_view` to compare - * @return true if column types match - */ -[[deprecated]] bool column_types_equal(column_view const& lhs, column_view const& rhs); - /** * @brief Compare the type IDs of two `column_view`s * diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 2ca22f0e017..b5248700d53 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -533,16 +533,5 @@ std::unique_ptr replace_multiple(strings_column_view const& strings, return detail::replace_multiple(strings, targets, repls, stream, mr); } -// deprecated in 24.08 -std::unique_ptr replace(strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::replace_multiple(strings, targets, repls, stream, mr); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index dac981fb532..3095b342748 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -139,11 +139,6 @@ bool have_same_types(column_view const& lhs, column_view const& rhs) return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); } -bool column_types_equal(column_view const& lhs, column_view const& rhs) -{ - return have_same_types(lhs, rhs); -} - bool have_same_types(column_view const& lhs, scalar const& rhs) { return type_dispatcher(lhs.type(), column_scalar_equal_fn{}, lhs, rhs); From cce00c00b0ae374ee72332aaea5fcd1cc121e85a Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 12 Aug 2024 14:38:37 -0700 Subject: [PATCH 049/270] Pass batch size to JSON reader using environment variable (#16502) The JSON reader set the batch size to `INT_MAX` bytes since the motivation for implementing a batched JSON reader was to parse source files whose total size is larger than `INT_MAX` (#16138, #16162). However, we can use a much smaller batch size to evaluate the correctness of the reader and speed up tests significantly. This PR focuses on reducing runtime of the batched reader test by setting the batch size to be used by the reader as an environment variable. The runtime of `JsonLargeReaderTest.MultiBatch` in `LARGE_STRINGS_TEST` gtest drops from ~52s to ~3s. 
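For illustration, a sketch of how a test can exercise multiple batches (the 1 MB value and the pre-built `options` are assumptions for this example; the reader falls back to the INT_MAX default whenever the variable is unset or out of range):

```cpp
#include <cudf/io/json.hpp>

#include <cstdlib>

// Cap each JSON reader batch at ~1 MB instead of INT_MAX bytes.
setenv("LIBCUDF_JSON_BATCH_SIZE", "1048576", 1);
auto result = cudf::io::read_json(options);  // input is now read in several batches
unsetenv("LIBCUDF_JSON_BATCH_SIZE");         // restore the default behavior
```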
Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16502 --- cpp/CMakeLists.txt | 1 - cpp/src/io/json/byte_range_info.cu | 37 ---- cpp/src/io/json/read_json.cu | 291 +++++++++++++++----------- cpp/src/io/json/read_json.hpp | 28 ++- cpp/tests/large_strings/json_tests.cu | 20 +- 5 files changed, 204 insertions(+), 173 deletions(-) delete mode 100644 cpp/src/io/json/byte_range_info.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 310bc99b279..eeafc411874 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -392,7 +392,6 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp - src/io/json/byte_range_info.cu src/io/json/json_column.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu diff --git a/cpp/src/io/json/byte_range_info.cu b/cpp/src/io/json/byte_range_info.cu deleted file mode 100644 index 258a40b0dd3..00000000000 --- a/cpp/src/io/json/byte_range_info.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include - -#include - -namespace cudf::io::json::detail { - -// Extract the first character position in the string. -size_type find_first_delimiter(device_span d_data, - char const delimiter, - rmm::cuda_stream_view stream) -{ - auto const first_delimiter_position = - thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter); - return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1; -} - -} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index e0d0497e0a2..2658cbbed2f 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -38,11 +39,14 @@ namespace cudf::io::json::detail { -size_t sources_size(host_span> const sources, - size_t range_offset, - size_t range_size) +namespace { + +// Return total size of sources enclosing the passed range +std::size_t sources_size(host_span> const sources, + std::size_t range_offset, + std::size_t range_size) { - return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](std::size_t sum, auto& source) { auto const size = source->size(); // TODO take care of 0, 0, or *, 0 case. 
return sum + @@ -50,109 +54,55 @@ size_t sources_size(host_span> const sources, }); } +// Return estimated size of subchunk using a heuristic involving the byte range size and the minimum +// subchunk size +std::size_t estimate_size_per_subchunk(std::size_t chunk_size) +{ + auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; + // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to + // 10kb) and the byte range size + return geometric_mean(std::ceil(static_cast(chunk_size) / num_subchunks), + min_subchunk_size); +} + /** - * @brief Read from array of data sources into RMM buffer. The size of the returned device span - can be larger than the number of bytes requested from the list of sources when - the range to be read spans across multiple sources. This is due to the delimiter - characters inserted after the end of each accessed source. + * @brief Return the upper bound on the batch size for the JSON reader. * - * @param buffer Device span buffer to which data is read - * @param sources Array of data sources - * @param compression Compression format of source - * @param range_offset Number of bytes to skip from source start - * @param range_size Number of bytes to read from source - * @param stream CUDA stream used for device memory operations and kernel launches - * @returns A subspan of the input device span containing data read + * The datasources passed to the JSON reader are split into batches demarcated by byte range + * offsets and read iteratively. The batch size is capped at INT_MAX bytes, which is the + * default value returned by the function. This value can be overridden at runtime using the + * environment variable LIBCUDF_JSON_BATCH_SIZE + * + * @return size in bytes */ -device_span ingest_raw_input(device_span buffer, - host_span> sources, - compression_type compression, - size_t range_offset, - size_t range_size, - rmm::cuda_stream_view stream) +std::size_t get_batch_size_upper_bound() { - CUDF_FUNC_RANGE(); - // We append a line delimiter between two files to make sure the last line of file i and the first - // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line - // delimiter. - auto constexpr num_delimiter_chars = 1; - - if (compression == compression_type::NONE) { - auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); - std::vector prefsum_source_sizes(sources.size()); - std::vector> h_buffers; - size_t bytes_read = 0; - std::transform_inclusive_scan(sources.begin(), - sources.end(), - prefsum_source_sizes.begin(), - std::plus{}, - [](std::unique_ptr const& s) { return s->size(); }); - auto upper = - std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); - size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); - - auto const total_bytes_to_read = - std::min(range_size, prefsum_source_sizes.back() - range_offset); - range_offset -= start_source ? 
prefsum_source_sizes[start_source - 1] : 0; - for (size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { - if (sources[i]->is_empty()) continue; - auto data_size = - std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); - auto destination = reinterpret_cast(buffer.data()) + bytes_read + - (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); - } else { - h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); - auto const& h_buffer = h_buffers.back(); - CUDF_CUDA_TRY(cudaMemcpyAsync( - destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); - bytes_read += h_buffer->size(); - } - range_offset = 0; - delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); - } - // Removing delimiter inserted after last non-empty source is read - if (!delimiter_map.empty()) { delimiter_map.pop_back(); } - - // If this is a multi-file source, we scatter the JSON line delimiters between files - if (sources.size() > 1) { - static_assert(num_delimiter_chars == 1, - "Currently only single-character delimiters are supported"); - auto const delimiter_source = thrust::make_constant_iterator('\n'); - auto const d_delimiter_map = cudf::detail::make_device_uvector_async( - delimiter_map, stream, rmm::mr::get_current_device_resource()); - thrust::scatter(rmm::exec_policy_nosync(stream), - delimiter_source, - delimiter_source + d_delimiter_map.size(), - d_delimiter_map.data(), - buffer.data()); - } - stream.synchronize(); - return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); - } - // TODO: allow byte range reading from multiple compressed files. - auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset); - auto hbuffer = std::vector(remaining_bytes_to_read); - // Single read because only a single compressed source is supported - // Reading to host because decompression of a single block is much faster on the CPU - sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); - auto uncomp_data = decompress(compression, hbuffer); - CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), - reinterpret_cast(uncomp_data.data()), - uncomp_data.size() * sizeof(char), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return buffer.first(uncomp_data.size()); + auto const batch_size_str = std::getenv("LIBCUDF_JSON_BATCH_SIZE"); + int64_t const batch_size = batch_size_str != nullptr ? std::atol(batch_size_str) : 0L; + auto const batch_limit = static_cast(std::numeric_limits::max()); + auto const batch_size_upper_bound = static_cast( + (batch_size > 0 && batch_size < batch_limit) ? 
batch_size : batch_limit); + return batch_size_upper_bound; } -size_t estimate_size_per_subchunk(size_t chunk_size) +/** + * @brief Extract the first delimiter character position in the string + * + * @param d_data Device span in which to search for delimiter character + * @param delimiter Delimiter character to search for + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Position of first delimiter character in device array + */ +size_type find_first_delimiter(device_span d_data, + char const delimiter, + rmm::cuda_stream_view stream) { - auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; - // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to - // 10kb) and the byte range size - return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); + auto const first_delimiter_position = + thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter); + return first_delimiter_position != d_data.end() + ? static_cast(thrust::distance(d_data.begin(), first_delimiter_position)) + : -1; } /** @@ -175,12 +125,12 @@ datasource::owning_buffer get_record_range_raw_input( { CUDF_FUNC_RANGE(); - size_t const total_source_size = sources_size(sources, 0, 0); + std::size_t const total_source_size = sources_size(sources, 0, 0); auto constexpr num_delimiter_chars = 1; auto const num_extra_delimiters = num_delimiter_chars * (sources.size() - 1); compression_type const reader_compression = reader_opts.get_compression(); - size_t const chunk_offset = reader_opts.get_byte_range_offset(); - size_t chunk_size = reader_opts.get_byte_range_size(); + std::size_t const chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, "Invalid offsetting", @@ -188,14 +138,14 @@ datasource::owning_buffer get_record_range_raw_input( auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; - int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; - size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); // The allocation for single source compressed input is estimated by assuming a ~4:1 // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea // of subchunks. auto constexpr header_size = 4096; - size_t const buffer_size = + std::size_t const buffer_size = reader_compression != compression_type::NONE ? 
total_source_size * estimated_compression_ratio + header_size : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + @@ -217,8 +167,8 @@ datasource::owning_buffer get_record_range_raw_input( return datasource::owning_buffer(std::move(empty_buf)); } else if (!should_load_all_sources) { // Find next delimiter - std::int64_t next_delim_pos = -1; - size_t next_subchunk_start = chunk_offset + chunk_size; + std::int64_t next_delim_pos = -1; + std::size_t next_subchunk_start = chunk_offset + chunk_size; while (next_subchunk_start < total_source_size && next_delim_pos < buffer_offset) { buffer_offset += readbufspan.size(); readbufspan = ingest_raw_input(bufspan.last(buffer_size - buffer_offset), @@ -243,6 +193,8 @@ datasource::owning_buffer get_record_range_raw_input( readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); } +// Helper function to read the current batch using byte range offsets and size +// passed table_with_metadata read_batch(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -270,6 +222,92 @@ table_with_metadata read_batch(host_span> sources, return device_parse_nested_json(buffer, reader_opts, stream, mr); } +} // anonymous namespace + +device_span ingest_raw_input(device_span buffer, + host_span> sources, + compression_type compression, + std::size_t range_offset, + std::size_t range_size, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // We append a line delimiter between two files to make sure the last line of file i and the first + // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line + // delimiter. + auto constexpr num_delimiter_chars = 1; + + if (compression == compression_type::NONE) { + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); + std::vector prefsum_source_sizes(sources.size()); + std::vector> h_buffers; + std::size_t bytes_read = 0; + std::transform_inclusive_scan(sources.begin(), + sources.end(), + prefsum_source_sizes.begin(), + std::plus{}, + [](std::unique_ptr const& s) { return s->size(); }); + auto upper = + std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); + std::size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); + + auto const total_bytes_to_read = + std::min(range_size, prefsum_source_sizes.back() - range_offset); + range_offset -= start_source ? 
prefsum_source_sizes[start_source - 1] : 0; + for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; + i++) { + if (sources[i]->is_empty()) continue; + auto data_size = + std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); + auto destination = reinterpret_cast(buffer.data()) + bytes_read + + (num_delimiter_chars * delimiter_map.size()); + if (sources[i]->is_device_read_preferred(data_size)) { + bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + } else { + h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); + auto const& h_buffer = h_buffers.back(); + CUDF_CUDA_TRY(cudaMemcpyAsync( + destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); + bytes_read += h_buffer->size(); + } + range_offset = 0; + delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); + } + // Removing delimiter inserted after last non-empty source is read + if (!delimiter_map.empty()) { delimiter_map.pop_back(); } + + // If this is a multi-file source, we scatter the JSON line delimiters between files + if (sources.size() > 1) { + static_assert(num_delimiter_chars == 1, + "Currently only single-character delimiters are supported"); + auto const delimiter_source = thrust::make_constant_iterator('\n'); + auto const d_delimiter_map = cudf::detail::make_device_uvector_async( + delimiter_map, stream, rmm::mr::get_current_device_resource()); + thrust::scatter(rmm::exec_policy_nosync(stream), + delimiter_source, + delimiter_source + d_delimiter_map.size(), + d_delimiter_map.data(), + buffer.data()); + } + stream.synchronize(); + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); + } + // TODO: allow byte range reading from multiple compressed files. + auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset); + auto hbuffer = std::vector(remaining_bytes_to_read); + // Single read because only a single compressed source is supported + // Reading to host because decompression of a single block is much faster on the CPU + sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); + auto uncomp_data = decompress(compression, hbuffer); + CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), + reinterpret_cast(uncomp_data.data()), + uncomp_data.size() * sizeof(char), + cudaMemcpyHostToDevice, + stream.value())); + stream.synchronize(); + return buffer.first(uncomp_data.size()); +} + table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -296,15 +334,16 @@ table_with_metadata read_json(host_span> sources, * Note that the batched reader does not work for compressed inputs or for regular * JSON inputs. */ - size_t const total_source_size = sources_size(sources, 0, 0); - size_t chunk_offset = reader_opts.get_byte_range_offset(); - size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? total_source_size - chunk_offset - : std::min(chunk_size, total_source_size - chunk_offset); + std::size_t const total_source_size = sources_size(sources, 0, 0); + std::size_t chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? 
total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); - size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); - size_t const batch_size_ub = - std::numeric_limits::max() - (max_subchunks_prealloced * size_per_subchunk); + std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); + std::size_t const batch_size = + batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk); /* * Identify the position (zero-indexed) of starting source file from which to begin @@ -314,10 +353,10 @@ table_with_metadata read_json(host_span> sources, */ // Prefix sum of source file sizes - size_t pref_source_size = 0; + std::size_t pref_source_size = 0; // Starting source file from which to being batching evaluated using byte range offset - size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { - for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { + std::size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { + for (std::size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; } pref_source_size += sources[src_idx]->size(); } @@ -329,16 +368,16 @@ table_with_metadata read_json(host_span> sources, * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading * stops. */ - size_t pref_bytes_size = chunk_offset; - size_t end_bytes_size = chunk_offset + chunk_size; - std::vector batch_offsets{pref_bytes_size}; - for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { + std::size_t pref_bytes_size = chunk_offset; + std::size_t end_bytes_size = chunk_offset + chunk_size; + std::vector batch_offsets{pref_bytes_size}; + for (std::size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { pref_source_size += sources[i]->size(); // If the current source file can subsume multiple batches, we split the file until the // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`) while (pref_bytes_size < end_bytes_size && - pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) { - auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size); + pref_source_size >= std::min(pref_bytes_size + batch_size, end_bytes_size)) { + auto next_batch_size = std::min(batch_size, end_bytes_size - pref_bytes_size); batch_offsets.push_back(batch_offsets.back() + next_batch_size); pref_bytes_size += next_batch_size; } @@ -356,7 +395,7 @@ table_with_metadata read_json(host_span> sources, // Dispatch individual batches to read_batch and push the resulting table into // partial_tables array. Note that the reader options need to be updated for each // batch to adjust byte range offset and byte range size. 
- for (size_t i = 0; i < batch_offsets.size() - 1; i++) { + for (std::size_t i = 0; i < batch_offsets.size() - 1; i++) { batched_reader_opts.set_byte_range_offset(batch_offsets[i]); batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]); partial_tables.emplace_back( diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 32de4ebabfa..7e3a920f00d 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -37,6 +37,20 @@ constexpr size_t min_subchunk_size = 10000; constexpr int estimated_compression_ratio = 4; constexpr int max_subchunks_prealloced = 3; +/** + * @brief Read from array of data sources into RMM buffer. The size of the returned device span + can be larger than the number of bytes requested from the list of sources when + the range to be read spans across multiple sources. This is due to the delimiter + characters inserted after the end of each accessed source. + * + * @param buffer Device span buffer to which data is read + * @param sources Array of data sources + * @param compression Compression format of source + * @param range_offset Number of bytes to skip from source start + * @param range_size Number of bytes to read from source + * @param stream CUDA stream used for device memory operations and kernel launches + * @returns A subspan of the input device span containing data read + */ device_span ingest_raw_input(device_span buffer, host_span> sources, compression_type compression, @@ -44,14 +58,20 @@ device_span ingest_raw_input(device_span buffer, size_t range_size, rmm::cuda_stream_view stream); +/** + * @brief Reads and returns the entire data set in batches. + * + * @param sources Input `datasource` objects to read the dataset from + * @param reader_opts Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return cudf::table object that contains the array of cudf::column. 
+ */ table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -size_type find_first_delimiter(device_span d_data, - char const delimiter, - rmm::cuda_stream_view stream); - } // namespace io::json::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu index 49abf7b484d..e34ab991c11 100644 --- a/cpp/tests/large_strings/json_tests.cu +++ b/cpp/tests/large_strings/json_tests.cu @@ -28,13 +28,17 @@ struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {}; TEST_F(JsonLargeReaderTest, MultiBatch) { - std::string json_string = R"( + std::string json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - constexpr size_t batch_size_ub = std::numeric_limits::max(); - constexpr size_t expected_file_size = 1.5 * static_cast(batch_size_ub); + + std::size_t const batch_size_upper_bound = std::numeric_limits::max() / 16; + // set smaller batch_size to reduce file size and execution time + setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1); + + constexpr std::size_t expected_file_size = 1.5 * static_cast(batch_size_upper_bound); std::size_t const log_repetitions = static_cast(std::ceil(std::log2(expected_file_size / json_string.size()))); @@ -66,8 +70,11 @@ TEST_F(JsonLargeReaderTest, MultiBatch) datasources.emplace_back(cudf::io::datasource::create(hb)); } // Test for different chunk sizes - std::vector chunk_sizes{ - batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast(batch_size_ub * 2)}; + std::vector chunk_sizes{batch_size_upper_bound / 4, + batch_size_upper_bound / 2, + batch_size_upper_bound, + static_cast(batch_size_upper_bound * 2)}; + for (auto chunk_size : chunk_sizes) { auto const tables = split_byte_range_reading(datasources, @@ -86,4 +93,7 @@ TEST_F(JsonLargeReaderTest, MultiBatch) // cannot use EQUAL due to concatenate removing null mask CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); } + + // go back to normal batch_size + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); } From e5f8dd33d78a2c964f8d6bac895deb73a9be7aa6 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 12 Aug 2024 16:52:52 -0500 Subject: [PATCH 050/270] Update the java code to properly deal with lists being returned as strings (#16536) Recently some JSON parsing was updated so lists could be returned as strings. This updates the java code so that, when cleaning up the results to match the desired schema, it can handle corner cases associated with lists and structs properly. Tests are covered in the Spark plugin, but I am happy to add some here if we really want to validate that part of this. 
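For background, a minimal sketch of the libcudf reader behavior being consumed here, assuming `mixed_types_as_string` is the reader option the description above refers to (this is an assumption, not confirmed by this PR); `src` is a placeholder `source_info`:

```cpp
#include <cudf/io/json.hpp>

// hedged sketch: with mixed types rendered as strings, a column that holds
// lists in only some rows can come back from the reader as a STRING column
cudf::io::table_with_metadata read_mixed(cudf::io::source_info const& src)
{
  auto opts = cudf::io::json_reader_options::builder(src)
                .lines(true)
                .mixed_types_as_string(true)  // assumption: the option referred to above
                .build();
  return cudf::io::read_json(opts);
}
```
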
Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16536 --- java/src/main/java/ai/rapids/cudf/Table.java | 29 +++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4e737451ed6..36e342cae13 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1084,7 +1084,12 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest // The types don't match so just return the input unchanged... return DidViewChange.no(); } else { - String[] foundNames = children.getNames(); + String[] foundNames; + if (children == null) { + foundNames = new String[0]; + } else { + foundNames = children.getNames(); + } HashMap indices = new HashMap<>(); for (int i = 0; i < foundNames.length; i++) { indices.put(foundNames[i], i); @@ -1101,8 +1106,9 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest for (int i = 0; i < columns.length; i++) { String neededColumnName = neededNames[i]; Integer index = indices.get(neededColumnName); + Schema childSchema = schema.getChild(i); if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { + if (childSchema.isStructOrHasStructDescendant()) { ColumnView child = cv.getChildColumnView(index); boolean shouldCloseChild = true; try { @@ -1131,8 +1137,23 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest } } else { somethingChanged = true; - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + if (types[i] == DType.LIST) { + try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else if (types[i] == DType.STRUCT) { + int numStructChildren = childSchema.getNumChildren(); + HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; + for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { + structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); + } + try (Scalar s = Scalar.structFromNull(structChildren)) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else { + try (Scalar s = Scalar.fromNull(types[i])) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } } } } From 7178bf2eb34334db909a151926d8112c441b3b09 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:45:44 -0400 Subject: [PATCH 051/270] Rework cudf::io::text::byte_range_info class member functions (#16518) Adds `const` declarations to appropriate member functions in class `cudf::io::text::byte_range_info` and moves the ctor implementation to .cpp file. This helps with using the `byte_range_info` objects in `const` variables and inside of `const` functions. 
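As a small usage sketch (not code from this change; `log_range` is a hypothetical helper), the `const` qualifiers permit:

```cpp
#include <cudf/io/text/byte_range_info.hpp>

#include <iostream>

// hypothetical helper: takes the range by const reference, which only works
// now that offset(), size(), and is_empty() are const member functions
void log_range(cudf::io::text::byte_range_info const& range)
{
  if (!range.is_empty()) {
    std::cout << "offset=" << range.offset() << " size=" << range.size() << "\n";
  }
}
```
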
Found while working on #15983 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16518 --- cpp/include/cudf/io/text/byte_range_info.hpp | 21 ++++++++------------ cpp/src/io/text/byte_range_info.cpp | 7 +++++++ cpp/src/io/text/multibyte_split.cu | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 7e9256be1d3..5f3c91dc99c 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include @@ -40,53 +39,49 @@ class byte_range_info { int64_t _size{}; ///< size in bytes public: - constexpr byte_range_info() = default; + byte_range_info() = default; /** * @brief Constructs a byte_range_info object * * @param offset offset in bytes * @param size size in bytes */ - constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) - { - CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); - CUDF_EXPECTS(size >= 0, "size must be non-negative"); - } + byte_range_info(int64_t offset, int64_t size); /** * @brief Copy constructor * * @param other byte_range_info object to copy */ - constexpr byte_range_info(byte_range_info const& other) noexcept = default; + byte_range_info(byte_range_info const& other) noexcept = default; /** * @brief Copy assignment operator * * @param other byte_range_info object to copy * @return this object after copying */ - constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; + byte_range_info& operator=(byte_range_info const& other) noexcept = default; /** * @brief Get the offset in bytes * * @return Offset in bytes */ - [[nodiscard]] constexpr int64_t offset() { return _offset; } + [[nodiscard]] int64_t offset() const { return _offset; } /** * @brief Get the size in bytes * * @return Size in bytes */ - [[nodiscard]] constexpr int64_t size() { return _size; } + [[nodiscard]] int64_t size() const { return _size; } /** * @brief Returns whether the span is empty. * - * @return true iff the span is empty, i.e. `size() == 0` + * @return true iff the range is empty, i.e. 
`size() == 0` */ - [[nodiscard]] constexpr bool empty() { return size() == 0; } + [[nodiscard]] bool is_empty() const { return size() == 0; } }; /** diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp index 6a7836ed4e1..fe811739b97 100644 --- a/cpp/src/io/text/byte_range_info.cpp +++ b/cpp/src/io/text/byte_range_info.cpp @@ -16,6 +16,7 @@ #include #include +#include #include @@ -23,6 +24,12 @@ namespace cudf { namespace io { namespace text { +byte_range_info::byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) +{ + CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); + CUDF_EXPECTS(size >= 0, "size must be non-negative"); +} + byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits::max()}; } std::vector create_byte_range_infos_consecutive(int64_t total_bytes, diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 97729a091fb..e3435a24b18 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -310,7 +310,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source { CUDF_FUNC_RANGE(); - if (byte_range.empty()) { return make_empty_column(type_id::STRING); } + if (byte_range.is_empty()) { return make_empty_column(type_id::STRING); } auto device_delim = cudf::string_scalar(delimiter, true, stream, mr); From 419fb99fa9ac471ae00ebe7787543b8e9cc154b5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:52:30 -0400 Subject: [PATCH 052/270] Fix all-empty input column for strings split APIs (#16466) Fixes specialized behavior for all empty input column on the strings split APIs. Verifying behavior with Pandas `str.split( pat, expand, regex )` `pat=None -- whitespace` `expand=False -- record APIs` `regex=True -- re APIs` - [x] `split` - [x] `split` - whitespace - [x] `rsplit` - [x] `rsplit` - whitespace - [x] `split_record` - [x] `split_record` - whitespace - [x] `rsplit_record` - [x] `rsplit_record` - whitespace - [x] `split_re` - [x] `rsplit_re` - [x] `split_record_re` - [x] `rsplit_record_re` Closes #16453 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16466 --- cpp/src/strings/split/split.cuh | 24 ++++++-------- cpp/src/strings/split/split_re.cu | 4 +++ cpp/tests/strings/split_tests.cpp | 47 ++++++++++++++++++++++++--- python/cudf/cudf/tests/test_string.py | 16 +++++++++ 4 files changed, 73 insertions(+), 18 deletions(-) diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 4d7096c02ca..af70367678e 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -142,7 +142,7 @@ struct base_split_tokenizer { // max_tokens already included in token counts if (d_tokens.size() == 1) { - d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + d_tokens[0] = string_index_pair{(d_str.empty() ? 
"" : d_str.data()), d_str.size_bytes()}; return; } @@ -357,24 +357,20 @@ std::pair, rmm::device_uvector> split auto const chars_bytes = get_offset_value(input.offsets(), input.offset() + strings_count, stream) - get_offset_value(input.offsets(), input.offset(), stream); - if (chars_bytes == 0) { - auto offsets = cudf::make_column_from_scalar( - numeric_scalar(0, true, stream), strings_count + 1, stream, mr); - auto tokens = rmm::device_uvector(0, stream); - return std::pair{std::move(offsets), std::move(tokens)}; - } auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column rmm::device_scalar d_count(0, stream); - constexpr int64_t block_size = 512; - constexpr size_type bytes_per_thread = 4; - auto const num_blocks = util::div_rounding_up_safe( - util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); - count_delimiters_kernel - <<>>( - tokenizer, d_offsets, chars_bytes, d_count.data()); + if (chars_bytes > 0) { + constexpr int64_t block_size = 512; + constexpr size_type bytes_per_thread = 4; + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_delimiters_kernel + <<>>( + tokenizer, d_offsets, chars_bytes, d_count.data()); + } // Create a vector of every delimiter position in the chars column. // These may include overlapping or otherwise out-of-bounds delimiters which diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index d72ec1085b5..e0aacf07ef0 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -71,6 +71,10 @@ struct token_reader_fn { auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; auto const d_result = d_tokens + token_offset; // store tokens here + if (nchars == 0) { + d_result[0] = string_index_pair{"", 0}; + return; + } int64_t token_idx = 0; auto itr = d_str.begin(); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 4c020cb4c29..7ece08b19f2 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -307,24 +307,46 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordAllEmpty) +TEST_F(StringsSplitTest, SplitAllEmpty) { auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); auto sv = cudf::strings_column_view(input); + auto empty = cudf::string_scalar(""); auto delimiter = cudf::string_scalar("s"); + + auto result = cudf::strings::split(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + // whitespace hits a special case where nothing matches returns an all-null column + auto expected = cudf::test::strings_column_wrapper({"", "", "", ""}, {0, 0, 0, 0}); + result = cudf::strings::split(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); + result = cudf::strings::rsplit(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); +} + +TEST_F(StringsSplitTest, SplitRecordAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); auto empty = 
cudf::string_scalar(""); + auto delimiter = cudf::string_scalar("s"); using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{}, LCW{}, LCW{}, LCW{}}); + LCW expected({LCW{""}, LCW{""}, LCW{""}, LCW{""}}); + LCW expected_empty({LCW{}, LCW{}, LCW{}, LCW{}}); + auto result = cudf::strings::split_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::split_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); result = cudf::strings::rsplit_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::rsplit_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); } TEST_F(StringsSplitTest, MultiByteDelimiters) @@ -575,6 +597,23 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) } } +TEST_F(StringsSplitTest, SplitRegexAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); + auto prog = cudf::strings::regex_program::create("[ _]"); + + auto result = cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + auto rec_result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); + rec_result = cudf::strings::rsplit_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a2a3e874c91..30880f074c0 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -978,6 +978,22 @@ def test_string_split_re(data, pat, n, expand): assert_eq(expect, got) +@pytest.mark.parametrize("pat", [None, "\\s+"]) +@pytest.mark.parametrize("regex", [False, True]) +@pytest.mark.parametrize("expand", [False, True]) +def test_string_split_all_empty(pat, regex, expand): + ps = pd.Series(["", "", "", ""], dtype="str") + gs = cudf.Series(["", "", "", ""], dtype="str") + + expect = ps.str.split(pat=pat, expand=expand, regex=regex) + got = gs.str.split(pat=pat, expand=expand, regex=regex) + + if isinstance(got, cudf.DataFrame): + assert_eq(expect, got, check_column_type=False) + else: + assert_eq(expect, got) + + @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) From 3a791cb8a83ca2cf446a910cb94d5a4e3edf2b9f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:56:43 -0400 Subject: [PATCH 053/270] Remove unneeded pair-iterator benchmark (#16511) Removes the pair-iterator benchmark logic. The remaining benchmarks use the null-replacement-iterator which uses the libcudf pair-iterator internally. There is no need for benchmarking this unique iterator pattern that is not used by libcudf. The `cpp/benchmarks/iterator/iterator.cu` failed to compile with gcc 12 because the sum-reduce function cannot resolve adding `thrust::pair` objects together likely due to some recent changes in CCCL. Regardless, adding `thrust::pair` objects is not something we need to benchmark. 
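The pattern that remains covered looks roughly like the following (a hedged sketch rather than the benchmark code itself; assumes `col` is a nullable DOUBLE column and the file is compiled as CUDA):

```cpp
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/iterator.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/reduce.h>

// sums a nullable DOUBLE column, reading nulls as 0.0; the null-replacement
// iterator wraps libcudf's pair iterator internally, so this exercises the
// same code path without any thrust::pair arithmetic
double sum_with_null_replacement(cudf::column_view const& col, rmm::cuda_stream_view stream)
{
  auto d_col = cudf::column_device_view::create(col, stream);
  auto it    = cudf::detail::make_null_replacement_iterator(*d_col, 0.0);
  return thrust::reduce(rmm::exec_policy(stream), it, it + col.size(), 0.0);
}
```
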
The existing benchmark benchmarks libcudf's usage of the internal pair-iterator correctly. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16511 --- cpp/benchmarks/iterator/iterator.cu | 77 ----------------------------- 1 file changed, 77 deletions(-) diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index ada7a9bd73d..fd0cebb12ea 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state) sizeof(TypeParam)); } -// operator+ defined for pair iterator reduction -template -__device__ thrust::pair operator+(thrust::pair lhs, thrust::pair rhs) -{ - return thrust::pair{lhs.first * lhs.second + rhs.first * rhs.second, - lhs.second + rhs.second}; -} -// ----------------------------------------------------------------------------- -template -void pair_iterator_bench_cub(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - int num_items = col.size(); - auto begin = d_col->pair_begin(); - reduce_by_cub(result.begin(), begin, num_items, init); -} - -template -void pair_iterator_bench_thrust(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - auto d_in = d_col->pair_begin(); - auto d_end = d_in + col.size(); - thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{}); -} - -template -void BM_pair_iterator(benchmark::State& state) -{ - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - using T = TypeParam; - auto num_gen = thrust::counting_iterator(0); - auto null_gen = - thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; }); - - cudf::test::fixed_width_column_wrapper wrap_hasnull_F(num_gen, num_gen + column_size); - cudf::test::fixed_width_column_wrapper wrap_hasnull_T( - num_gen, num_gen + column_size, null_gen); - cudf::column_view hasnull_F = wrap_hasnull_F; - cudf::column_view hasnull_T = wrap_hasnull_T; - - // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( - 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - if (cub_or_thrust) { - pair_iterator_bench_cub(hasnull_T, - dev_result); // driven by pair iterator with nulls - } else { - pair_iterator_bench_thrust(hasnull_T, - dev_result); // driven by pair iterator with nulls - } - } - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - sizeof(TypeParam)); -} - #define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \ BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ { \ @@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true); ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false); ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true); ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false); - -#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \ - BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ - { \ - BM_pair_iterator(state); \ - } \ - 
BENCHMARK_REGISTER_F(Iterator, name) \ - ->RangeMultiplier(10) \ - ->Range(1000, 10000000) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true); -PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false); From 3801f811ab7713e4cb9cc3bb34d282f8a04e71e4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 13 Aug 2024 12:40:40 -0500 Subject: [PATCH 054/270] Remove hardcoded versions from workflows. (#16540) This PR removes hardcoded Python versions from CI workflows. It is a prerequisite for dropping Python 3.9. See https://github.com/rapidsai/build-planning/issues/88. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16540 --- .github/workflows/pandas-tests.yaml | 3 ++- .github/workflows/pr.yaml | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index cf0c2b377dd..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -19,7 +19,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c2e7f64f952..ea8a1762b2c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -187,6 +187,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/run_tests.sh @@ -196,7 +197,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. From 5780c4d8fb5afac2e04988a2ff5531f94c22d3a3 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 13 Aug 2024 13:46:31 -0700 Subject: [PATCH 055/270] Register `read_parquet` and `read_csv` with dask-expr (#16535) After https://github.com/dask/dask-expr/pull/1114, Dask cuDF must register specific `read_parquet` and `read_csv` functions to be used when query-planning is enabled (the default). **This PR is required for CI to pass with dask>2024.8.0** **NOTE**: It probably doesn't make sense to add specific tests for this change. 
Once the 2024.7.1 dask pin is removed, all `dask_cudf` tests using `read_parquet` and `read_csv` will fail without this change... Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/16535 --- python/dask_cudf/dask_cudf/backends.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 2b1f745fc04..01bab30190a 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -667,6 +667,41 @@ def from_dict( constructor=constructor, ) + @staticmethod + def read_parquet(*args, engine=None, **kwargs): + import dask_expr as dx + + from dask_cudf.io.parquet import CudfEngine + + return _default_backend( + dx.read_parquet, *args, engine=CudfEngine, **kwargs + ) + + @staticmethod + def read_csv( + path, + *args, + header="infer", + dtype_backend=None, + storage_options=None, + **kwargs, + ): + import dask_expr as dx + from fsspec.utils import stringify_path + + if not isinstance(path, str): + path = stringify_path(path) + return dx.new_collection( + dx.io.csv.ReadCSV( + path, + dtype_backend=dtype_backend, + storage_options=storage_options, + kwargs=kwargs, + header=header, + dataframe_backend="cudf", + ) + ) + @staticmethod + def read_json(*args, **kwargs): + from dask_cudf.io.json import read_json as read_json_impl From cf3fabf7d090dcd983080e3c844002ebb7280e77 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 13 Aug 2024 22:59:47 +0200 Subject: [PATCH 056/270] Ensure comparisons with pyints and integer series always succeed (#16532) When Python integers are compared to a series of integers, the result can always be correctly defined no matter the values of the Python integer. This was always a very mild issue. But with NumPy 2 behavior not upcasting the computation result type based on the value anymore, even things like: ``` cudf.Series([1, 2, 3], dtype="int8") < 1000 ``` would fail. (Similar paths could be taken for other integer scalars, but that would mostly be nice for performance.) N.B. NumPy/pandas also support exact comparisons when mixing e.g. uint64 and int64. This is another rare exception that cudf currently does not support. Closes gh-16282 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16532 --- python/cudf/cudf/core/column/numerical.py | 54 +++++++++++++++++------ python/cudf/cudf/tests/test_binops.py | 41 +++++++++++++++++ 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b83d7600c82..bbc74ef349e 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -199,16 +199,53 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: np.bool_: np.float32, } + out_dtype = None if op in {"__truediv__", "__rtruediv__"}: # Division with integer types results in a suitable float. 
if truediv_type := int_float_dtype_mapping.get(self.dtype.type): return self.astype(truediv_type)._binaryop(other, op) + elif op in { + "__lt__", + "__gt__", + "__le__", + "__ge__", + "__eq__", + "__ne__", + }: + out_dtype = "bool" + + # If `other` is a Python integer and it is out-of-bounds + # promotion could fail but we can trivially define the result + # in terms of `notnull` or `NULL_NOT_EQUALS`. + if type(other) is int and self.dtype.kind in "iu": # noqa: E721 + truthiness = None + iinfo = np.iinfo(self.dtype) + if iinfo.min > other: + truthiness = op in {"__ne__", "__gt__", "__ge__"} + elif iinfo.max < other: + truthiness = op in {"__ne__", "__lt__", "__le__"} + + # Compare with minimum value so that the result is true/false + if truthiness is True: + other = iinfo.min + op = "__ge__" + elif truthiness is False: + other = iinfo.min + op = "__lt__" + + elif op in {"NULL_EQUALS", "NULL_NOT_EQUALS"}: + out_dtype = "bool" reflect, op = self._check_reflected_op(op) if (other := self._wrap_binop_normalization(other)) is NotImplemented: return NotImplemented - out_dtype = self.dtype - if other is not None: + + if out_dtype is not None: + pass # out_dtype was already set to bool + if other is None: + # not a binary operator, so no need to promote + out_dtype = self.dtype + elif out_dtype is None: out_dtype = np.result_type(self.dtype, other.dtype) if op in {"__mod__", "__floordiv__"}: tmp = self if reflect else other @@ -225,17 +262,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = cudf.dtype("float64") elif is_scalar(tmp) and tmp == 0: out_dtype = cudf.dtype("float64") - if op in { - "__lt__", - "__gt__", - "__le__", - "__ge__", - "__eq__", - "__ne__", - "NULL_EQUALS", - "NULL_NOT_EQUALS", - }: - out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: if self.dtype.kind == "f" or other.dtype.kind == "f": @@ -247,7 +273,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" - if ( + elif ( op == "__pow__" and self.dtype.kind in "iu" and (is_integer(other) or other.dtype.kind in "iu") diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 503b1a975b4..4256ec872e6 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -290,6 +290,47 @@ def test_series_compare(cmpop, obj_class, dtype): np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2)) +@pytest.mark.parametrize( + "dtype,val", + [("int8", 200), ("int32", 2**32), ("uint8", -128), ("uint64", -1)], +) +@pytest.mark.parametrize( + "op", + [ + operator.eq, + operator.ne, + operator.lt, + operator.le, + operator.gt, + operator.ge, + ], +) +@pytest.mark.parametrize("reverse", [False, True]) +def test_series_compare_integer(dtype, val, op, reverse): + # Tests that these actually work, even though they are out of bound. + force_cast_val = np.array(val).astype(dtype) + sr = Series( + [np.iinfo(dtype).min, np.iinfo(dtype).max, force_cast_val, None], + dtype=dtype, + ) + + if reverse: + _op = op + + def op(x, y): + return _op(y, x) + + # We expect the same result as comparing to a value within range (e.g. 
0) + # except that a NULL value evaluates to False + if op(0, val): + expected = Series([True, True, True, None]) + else: + expected = Series([False, False, False, None]) + + res = op(sr, val) + assert_eq(res, expected) + + def _series_compare_nulls_typegen(): return [ *combinations_with_replacement(DATETIME_TYPES, 2), From 1f0d0c93f315f64698ffcc80082926896facf13a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:07:22 -0400 Subject: [PATCH 057/270] Change cudf::empty_like to not include offsets for empty strings columns (#16529) Fixes `cudf::empty_like` to only create empty child columns for nested types. The empty child columns are needed to store the types for consistency with `cudf::make_empty_column`. Closes #16490 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/16529 --- cpp/src/copying/copy.cpp | 6 ++++++ cpp/tests/copying/pack_tests.cpp | 6 ++++-- cpp/tests/replace/replace_nulls_tests.cpp | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 98ee6aa8f68..bac8dbe5d95 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -143,6 +143,12 @@ std::unique_ptr empty_like(column_view const& input) { CUDF_FUNC_RANGE(); + // test_dataframe.py passes an EMPTY column type here; + // this causes is_nested to throw an error since it uses the type-dispatcher + if ((input.type().id() == type_id::EMPTY) || !cudf::is_nested(input.type())) { + return make_empty_column(input.type()); + } + std::vector> children; std::transform(input.child_begin(), input.child_end(), diff --git a/cpp/tests/copying/pack_tests.cpp b/cpp/tests/copying/pack_tests.cpp index ea4408efa6a..8a50e071cb9 100644 --- a/cpp/tests/copying/pack_tests.cpp +++ b/cpp/tests/copying/pack_tests.cpp @@ -573,6 +573,8 @@ TEST_F(PackUnpackTest, SlicedEmpty) cudf::table_view t({a, b, c, d}); - auto sliced = cudf::split(t, {0}); - this->run_test(sliced[0]); + auto sliced = cudf::split(t, {0}); + auto packed = cudf::pack(t); + auto unpacked = cudf::unpack(packed); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t, unpacked); } diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index 9603ea44a76..fcee27305f2 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -674,7 +674,7 @@ TEST_F(ReplaceDictionaryTest, ReplaceNullsEmpty) cudf::test::fixed_width_column_wrapper input_empty_w({}); auto input_empty = cudf::dictionary::encode(input_empty_w); auto result = cudf::replace_nulls(input_empty->view(), input_empty->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), input_empty->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), input_empty->view()); } TEST_F(ReplaceDictionaryTest, ReplaceNullsNoNulls) From c20d6b3a3588c70d985e0d737fed844a9c0c6426 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:07:51 -0400 Subject: [PATCH 058/270] Remove unneeded output size parameter from internal count_matches utility (#16531) Removes `output_size` parameter from `cudf::strings::detail::count_matches` utility since the output size should equal the input size from the first parameter. This also removes an unnecessary `assert()` call. 
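The resulting call shape (assembled from the call sites in this diff; the variable names are illustrative) is:

```cpp
// one count per input row: the output column's size now always matches
// d_strings.size(), so callers no longer pass an output_size argument
auto d_strings = cudf::column_device_view::create(input.parent(), stream);
auto d_prog    = cudf::strings::detail::regex_device_builder::create_prog_device(prog, stream);
auto counts    = cudf::strings::detail::count_matches(*d_strings, *d_prog, stream, mr);
```
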
The parameter became unnecessary as part of the large strings work. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16531 --- cpp/src/strings/contains.cu | 2 +- cpp/src/strings/count_matches.cu | 9 +++------ cpp/src/strings/count_matches.hpp | 2 -- cpp/src/strings/extract/extract_all.cu | 2 +- cpp/src/strings/search/findall.cu | 2 +- cpp/src/strings/split/split_re.cu | 6 +++--- 6 files changed, 9 insertions(+), 14 deletions(-) diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 718ac41e36c..79d241205df 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -112,7 +112,7 @@ std::unique_ptr count_re(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - auto result = count_matches(*d_strings, *d_prog, input.size(), stream, mr); + auto result = count_matches(*d_strings, *d_prog, stream, mr); if (input.has_nulls()) { result->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index e8672ea5335..4ad3a75baf7 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -60,18 +60,15 @@ struct count_fn { std::unique_ptr count_matches(column_device_view const& d_strings, reprog_device& d_prog, - size_type output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - assert(output_size >= d_strings.size() and "Unexpected output size"); - auto results = make_numeric_column( - data_type{type_to_id()}, output_size, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, d_strings.size(), mask_state::UNALLOCATED, stream, mr); - if (d_strings.size() == 0) return results; + if (d_strings.size() == 0) { return results; } - auto d_results = results->mutable_view().data(); + auto d_results = results->mutable_view().data(); launch_transform_kernel(count_fn{d_strings}, d_prog, d_results, d_strings.size(), stream); diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index 4a5efac37fd..eab9863b975 100644 --- a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -37,14 +37,12 @@ class reprog_device; * * @param d_strings Device view of the input strings column. * @param d_prog Regex instance to evaluate on each string. - * @param output_size Number of rows for the output column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return Integer column of match counts */ std::unique_ptr count_matches(column_device_view const& d_strings, reprog_device& d_prog, - size_type output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 27691068d5a..897eba58833 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -119,7 +119,7 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // Get the match counts for each string. // This column will become the output lists child offsets column. 
- auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr); + auto counts = count_matches(*d_strings, *d_prog, stream, mr); auto d_counts = counts->mutable_view().data(); // Compute null output rows diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 0d0962258cf..2f7e7352458 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -104,7 +104,7 @@ std::unique_ptr findall(strings_column_view const& input, auto d_prog = regex_device_builder::create_prog_device(prog, stream); // Create lists offsets column - auto const sizes = count_matches(*d_strings, *d_prog, strings_count, stream, mr); + auto const sizes = count_matches(*d_strings, *d_prog, stream, mr); auto [offsets, total_matches] = cudf::detail::make_offsets_child_column( sizes->view().begin(), sizes->view().end(), stream, mr); auto const d_offsets = offsets->view().data(); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index e0aacf07ef0..d273c93ec12 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -210,8 +210,8 @@ std::unique_ptr
split_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto const counts = count_matches( - *d_strings, *d_prog, strings_count, stream, rmm::mr::get_current_device_resource()); + auto const counts = + count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); // get the split tokens from the input column; this also converts the counts into offsets auto [tokens, offsets] = @@ -275,7 +275,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr); + auto counts = count_matches(*d_strings, *d_prog, stream, mr); // get the split tokens from the input column; this also converts the counts into offsets auto [tokens, offsets] = From bf3372b1aa02939db32b2df62ab816a0eb9abdde Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 14 Aug 2024 12:06:29 -0500 Subject: [PATCH 059/270] Switch python version to `3.10` in `cudf.pandas` pandas test scripts (#16559) python 3.9 support was recently dropped in rapids, hence changing the python version to 3.10 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16559 --- ci/cudf_pandas_scripts/pandas-tests/diff.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index 6cf70a2347f..5dbb4ba991c 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -12,7 +12,7 @@ RAPIDS_FULL_VERSION=$(<./VERSION) rapids-logger "Github job name: ${GH_JOB_NAME}" rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" -PY_VER="39" +PY_VER="310" MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json From d684ae0e80d179d4d711c00278d00b5f66625303 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:36:51 -1000 Subject: [PATCH 060/270] Raise NotImplementedError for Series.rename that's not a scalar (#16525) xref https://github.com/rapidsai/cudf/issues/16507 Raising a `NotImplementedError` gives a chance for this work in `cudf.pandas` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16525 --- python/cudf/cudf/core/series.py | 4 ++++ python/cudf/cudf/tests/test_series.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 53675d339ac..822b966364f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3589,6 +3589,10 @@ def rename( raise NotImplementedError("level is currently not supported.") if errors != "ignore": raise NotImplementedError("errors is currently not supported.") + if not is_scalar(index): + raise NotImplementedError( + ".rename does not currently support relabeling the index." 
+ ) out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6a1887afb1f..c7aea563535 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2289,6 +2289,13 @@ def test_series_rename(initial_name, name): assert_eq(actual, expected) +@pytest.mark.parametrize("index", [lambda x: x * 2, {1: 2}]) +def test_rename_index_not_supported(index): + ser = cudf.Series(range(2)) + with pytest.raises(NotImplementedError): + ser.rename(index=index) + + @pytest.mark.parametrize( "data", [ From 0253e976ede25d954c607663da61b445e213523f Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 14 Aug 2024 21:27:52 -0400 Subject: [PATCH 061/270] [FEA] Support named aggregations in `df.groupby().agg()` (#16528) Closes #15967 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16528 --- python/cudf/cudf/core/column_accessor.py | 7 +--- python/cudf/cudf/core/groupby/groupby.py | 41 ++++++++++++------- python/cudf/cudf/tests/groupby/test_agg.py | 30 ++++++++++++++ .../cudf/cudf/tests/test_column_accessor.py | 4 ++ python/cudf/cudf/tests/test_dataframe.py | 1 - 5 files changed, 62 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 83596704672..48bc84070b1 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -610,7 +610,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Mapping[Any, Any] | Callable, level: int | None + self, mapper: Mapping[Any, Any] | Callable, level: int | None = None ) -> ColumnAccessor: """ Rename the specified levels of the given ColumnAccessor @@ -653,10 +653,7 @@ def rename_column(x): return x if level is None: - raise NotImplementedError( - "Renaming columns with a MultiIndex and level=None is" - "not supported" - ) + level = 0 new_col_names = (rename_column(k) for k in self.keys()) else: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 92c4b73ceaa..9b71ea57f1f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -548,7 +548,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -648,11 +648,10 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): raise NotImplementedError( "Passing args to func is currently not supported." ) - if kwargs: - raise NotImplementedError( - "Passing kwargs to func is currently not supported." 
- ) - column_names, columns, normalized_aggs = self._normalize_aggs(func) + + column_names, columns, normalized_aggs = self._normalize_aggs( + func, **kwargs + ) orig_dtypes = tuple(c.dtype for c in columns) # Note: When there are no key columns, the below produces @@ -1266,11 +1265,11 @@ def _grouped(self, *, include_groups: bool = True): return (group_names, offsets, grouped_keys, grouped_values) def _normalize_aggs( - self, aggs: MultiColumnAggType + self, aggs: MultiColumnAggType, **kwargs ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: """ Normalize aggs to a list of list of aggregations, where `out[i]` - is a list of aggregations for column `self.obj[i]`. We support three + is a list of aggregations for column `self.obj[i]`. We support four different form of `aggs` input here: - A single agg, such as "sum". This agg is applied to all value columns. @@ -1279,18 +1278,30 @@ def _normalize_aggs( - A mapping of column name to aggs, such as {"a": ["sum"], "b": ["mean"]}, the aggs are applied to specified column. + - Pairs of column name and agg tuples passed as kwargs + eg. col1=("a", "sum"), col2=("b", "prod"). The output column names are + the keys. The aggs are applied to the corresponding column in the tuple. Each agg can be string or lambda functions. """ aggs_per_column: Iterable[AggType | Iterable[AggType]] - if isinstance(aggs, dict): - column_names, aggs_per_column = aggs.keys(), aggs.values() - columns = tuple(self.obj._data[col] for col in column_names) + # TODO: Remove isinstance condition when the legacy dask_cudf API is removed. + # See https://github.com/rapidsai/cudf/pull/16528#discussion_r1715482302 for information. + if aggs or isinstance(aggs, dict): + if isinstance(aggs, dict): + column_names, aggs_per_column = aggs.keys(), aggs.values() + columns = tuple(self.obj._data[col] for col in column_names) + else: + values = self.grouping.values + column_names = values._column_names + columns = values._columns + aggs_per_column = (aggs,) * len(columns) + elif not aggs and kwargs: + column_names, aggs_per_column = kwargs.keys(), kwargs.values() + columns = tuple(self.obj._data[x[0]] for x in kwargs.values()) + aggs_per_column = tuple(x[1] for x in kwargs.values()) else: - values = self.grouping.values - column_names = values._column_names - columns = values._columns - aggs_per_column = (aggs,) * len(columns) + raise TypeError("Must provide at least one aggregation function.") # is_list_like performs type narrowing but type-checkers don't # know it. 
One could add a TypeGuard annotation to diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index f8847f02d5a..99e7523031b 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -3,6 +3,7 @@ import pytest import cudf +from cudf.testing import assert_eq @pytest.mark.parametrize( @@ -26,3 +27,32 @@ def test_series_agg(attr): pd_agg = getattr(pdf.groupby(["a"])["a"], attr)("count") assert agg.ndim == pd_agg.ndim + + +@pytest.mark.parametrize("func", ["sum", "prod", "mean", "count"]) +@pytest.mark.parametrize("attr", ["agg", "aggregate"]) +def test_dataframe_agg(attr, func): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + pdf = df.to_pandas() + + agg = getattr(df.groupby("a"), attr)(func) + pd_agg = getattr(pdf.groupby(["a"]), attr)(func) + + assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)({"b": func}) + pd_agg = getattr(pdf.groupby(["a"]), attr)({"b": func}) + + assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)([func]) + pd_agg = getattr(pdf.groupby(["a"]), attr)([func]) + + assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)(foo=("b", func), bar=("a", func)) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + assert_eq(agg, pd_agg) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index e84e1433c10..2d7bc809d4d 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -362,6 +362,10 @@ def test_replace_level_values_MultiColumn(): got = ca.rename_levels(mapper={"a": "f"}, level=0) check_ca_equal(expect, got) + # passing without level kwarg assumes level=0 + got = ca.rename_levels(mapper={"a": "f"}) + check_ca_equal(expect, got) + def test_clear_nrows_empty_before(): ca = ColumnAccessor({}) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2c59253d500..89eb5a12c71 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9409,7 +9409,6 @@ def test_rename_for_level_RangeIndex_dataframe(): assert_eq(expect, got) -@pytest_xfail(reason="level=None not implemented yet") def test_rename_for_level_is_None_MC(): gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) From 19846b6c0ac40fc91ad28573af04ac7403754acb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:15:03 -1000 Subject: [PATCH 062/270] Disallow cudf.Index accepting column in favor of ._from_column (#16549) Similar to https://github.com/rapidsai/cudf/pull/16454, this PR disallows the public `cudf.Index` accepting a private `ColumnBase` object in favor of `_from_column` (which was added in the linked PR) Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16549 --- python/cudf/cudf/_lib/parquet.pyx | 4 +- python/cudf/cudf/_lib/utils.pyx | 6 +- python/cudf/cudf/api/types.py | 2 +- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/algorithms.py | 6 +- python/cudf/cudf/core/column/categorical.py | 8 +- python/cudf/cudf/core/column/datetime.py | 10 +- python/cudf/cudf/core/column/methods.py | 6 +- 
python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/core/dtypes.py | 14 +- python/cudf/cudf/core/groupby/groupby.py | 9 +- python/cudf/cudf/core/index.py | 238 ++++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 24 +- python/cudf/cudf/core/multiindex.py | 7 +- python/cudf/cudf/core/resample.py | 4 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/core/tools/datetimes.py | 16 +- python/cudf/cudf/testing/testing.py | 8 +- python/cudf/cudf/tests/test_multiindex.py | 4 +- python/cudf/cudf/tests/test_string.py | 2 +- 22 files changed, 232 insertions(+), 154 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 4a4b13b0b31..0fffb6ade58 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -222,7 +222,7 @@ cdef object _process_metadata(object df, if len(filtered_idx) > 0: idx = cudf.concat(filtered_idx) else: - idx = cudf.Index(cudf.core.column.column_empty(0)) + idx = cudf.Index._from_column(cudf.core.column.column_empty(0)) else: start = range_index_meta["start"] + skip_rows stop = range_index_meta["stop"] @@ -240,7 +240,7 @@ cdef object _process_metadata(object df, index_data = df[index_col] actual_index_names = list(index_col_names.values()) if len(index_data._data) == 1: - idx = cudf.Index( + idx = cudf.Index._from_column( index_data._data.columns[0], name=actual_index_names[0] ) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index f136cd997a7..267432a0182 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -93,12 +93,12 @@ cpdef generate_pandas_metadata(table, index): materialize_index = False if index is not False: for level, name in enumerate(table._index.names): - if isinstance(table._index, cudf.core.multiindex.MultiIndex): + if isinstance(table._index, cudf.MultiIndex): idx = table.index.get_level_values(level) else: idx = table.index - if isinstance(idx, cudf.core.index.RangeIndex): + if isinstance(idx, cudf.RangeIndex): if index is None: descr = { "kind": "range", @@ -110,7 +110,7 @@ cpdef generate_pandas_metadata(table, index): else: materialize_index = True # When `index=True`, RangeIndex needs to be materialized. 
- materialized_idx = cudf.Index(idx._values, name=idx.name) + materialized_idx = idx._as_int_index() descr = _index_level_name( index_name=materialized_idx.name, level=level, diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 294ae2fd985..9c436dfad18 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -249,7 +249,7 @@ def _union_categoricals( new_categories=sorted_categories ) - return cudf.Index(result_col) + return cudf.CategoricalIndex._from_column(result_col) def is_bool_dtype(arr_or_dtype): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c91514202c5..d13351c49dd 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1979,7 +1979,7 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default): name=index.name, ) else: - return cudf.Index( + return cudf.Index._from_column( column.as_column(index, nan_as_null=nan_as_null), name=index.name, ) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 6c69fbd2637..e27d6ec8d3e 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -8,7 +8,7 @@ import numpy as np from cudf.core.column import as_column -from cudf.core.index import RangeIndex, ensure_index +from cudf.core.index import Index, RangeIndex from cudf.core.scalar import Scalar from cudf.options import get_option from cudf.utils.dtypes import can_convert_to_column @@ -112,7 +112,9 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else ensure_index(cats) + return labels, cats.values if return_cupy_array else Index._from_column( + cats + ) def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6fa69eb9cc1..d25983842f9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -601,11 +601,13 @@ def __setitem__(self, key, value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): - arr = [value] + arr = column.as_column(value, length=1, nan_as_null=False) else: - arr = value + arr = column.as_column(value, nan_as_null=False) to_add_categories = len( - cudf.Index(arr, nan_as_null=False).difference(self.categories) + cudf.Index._from_column(arr).difference( + cudf.Index._from_column(self.categories) + ) ) if to_add_categories > 0: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index ce67ce81e6b..1dbc94384d3 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -250,6 +250,10 @@ def __contains__(self, item: ScalarLike) -> bool: def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] + @property + def quarter(self) -> ColumnBase: + return libcudf.datetime.extract_quarter(self) + @property def year(self) -> ColumnBase: return self.get_dt_field("year") @@ -308,7 +312,7 @@ def is_quarter_start(self) -> ColumnBase: @property def is_year_end(self) -> ColumnBase: day_of_year = self.day_of_year - leap_dates = libcudf.datetime.is_leap_year(self) + leap_dates = self.is_leap_year leap = day_of_year == cudf.Scalar(366) non_leap = day_of_year == cudf.Scalar(365) @@ -316,6 +320,10 @@ def is_year_end(self) -> ColumnBase: False ) + @property 
+ def is_leap_year(self) -> ColumnBase: + return libcudf.datetime.is_leap_year(self) + @property def is_year_start(self) -> ColumnBase: return (self.day_of_year == 1).fillna(False) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 8c46d238057..05a0ab2e09a 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -65,8 +65,8 @@ def _return_or_inplace( """ if inplace: self._parent._mimic_inplace( - self._parent.__class__._from_data( - {self._parent.name: new_col} + type(self._parent)._from_column( + new_col, name=self._parent.name ), inplace=True, ) @@ -92,6 +92,6 @@ def _return_or_inplace( index=self._parent.index if retain_index else None, ) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index(new_col, name=self._parent.name) + return cudf.Index._from_column(new_col, name=self._parent.name) else: return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1a4b558749d..a710a9f46c2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4693,7 +4693,7 @@ def character_tokenize(self) -> SeriesOrIndex: result_col, name=self._parent.name, index=index ) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index(result_col, name=self._parent.name) + return cudf.Index._from_column(result_col, name=self._parent.name) else: return result_col diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 197f46ee9fe..a4ceea266b4 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -292,7 +292,7 @@ def cut( ) # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.CategoricalIndex._from_data({None: col}) + categorical_index = cudf.CategoricalIndex._from_column(col) if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a53c7bcc63c..3033abd53f5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -326,7 +326,7 @@ def _getitem_tuple_arg(self, arg): range(len(tmp_arg[0])) ) }, - index=cudf.Index(tmp_arg[0]), + index=cudf.Index._from_column(tmp_arg[0]), ) columns_df[cantor_name] = column.as_column( range(len(columns_df)) @@ -1758,7 +1758,7 @@ def _concat( for cols in columns: table_index = None if 1 == first_data_column_position: - table_index = cudf.Index(cols[0]) + table_index = cudf.Index._from_column(cols[0]) elif first_data_column_position > 1: table_index = cudf.MultiIndex._from_data( data=dict( @@ -1810,7 +1810,7 @@ def _concat( if not isinstance(out.index, MultiIndex) and isinstance( out.index.dtype, cudf.CategoricalDtype ): - out = out.set_index(cudf.Index(out.index._values)) + out = out.set_index(out.index) for name, col in out._data.items(): out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype @@ -3007,7 +3007,7 @@ def set_index( and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex)) ): # Don't turn single level MultiIndex into an Index - idx = cudf.Index(data_to_add[0], name=names[0]) + idx = cudf.Index._from_column(data_to_add[0], name=names[0]) else: idx = MultiIndex._from_data(dict(enumerate(data_to_add))) idx.names = names diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 27afec18b4e..6d532e01cba 100644 --- 
a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -182,7 +182,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None: self._ordered = ordered @property - def categories(self) -> "cudf.core.index.Index": + def categories(self) -> cudf.Index: """ An ``Index`` containing the unique categories allowed. @@ -194,10 +194,12 @@ def categories(self) -> "cudf.core.index.Index": Index(['b', 'a'], dtype='object') """ if self._categories is None: - return cudf.Index( - cudf.core.column.column_empty(0, dtype="object", masked=False) + col = cudf.core.column.column_empty( + 0, dtype="object", masked=False ) - return cudf.Index(self._categories, copy=False) + else: + col = self._categories + return cudf.Index._from_column(col) @property def type(self): @@ -259,7 +261,9 @@ def to_pandas(self) -> pd.CategoricalDtype: categories = self._categories.to_pandas() return pd.CategoricalDtype(categories=categories, ordered=self.ordered) - def _init_categories(self, categories: Any): + def _init_categories( + self, categories: Any + ) -> cudf.core.column.ColumnBase | None: if categories is None: return categories if len(categories) == 0 and not isinstance( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 9b71ea57f1f..4f283d41b17 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -403,8 +403,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: if len(group_keys) > 1: index = cudf.MultiIndex.from_arrays(group_keys) else: - (group_keys,) = group_keys - index = cudf.Index(group_keys) + index = cudf.Index._from_column(group_keys[0]) return dict( zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) ) @@ -2583,7 +2582,7 @@ def _mimic_pandas_order( # corresponding output rows in pandas, to do that here # expand the result by reindexing. 
ri = cudf.RangeIndex(0, len(self.obj)) - result.index = cudf.Index(ordering) + result.index = cudf.Index._from_column(ordering) # This reorders and expands result = result.reindex(ri) else: @@ -3154,7 +3153,9 @@ def keys(self): dict(zip(range(nkeys), self._key_columns)) )._set_names(self.names) else: - return cudf.Index(self._key_columns[0], name=self.names[0]) + return cudf.Index._from_column( + self._key_columns[0], name=self.names[0] + ) @property def values(self) -> cudf.core.frame.Frame: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3eab27bd165..c55f86d48e1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -18,7 +18,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence from cudf._lib.search import search_sorted from cudf._lib.types import size_type_dtype @@ -819,22 +818,23 @@ def sort_values( @_performance_tracking def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return cudf.Index._from_data( - {self.name: self._values.take(gather_map, nullify, check_bounds)} + return cudf.Index._from_column( + self._column.take(gather_map, nullify, check_bounds), + name=self.name, ) @_performance_tracking def _apply_boolean_mask(self, boolean_mask): - return cudf.Index._from_data( - {self.name: self._values.apply_boolean_mask(boolean_mask)} + return cudf.Index._from_column( + self._column.apply_boolean_mask(boolean_mask), name=self.name ) def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return cudf.Index._from_data( - {self.name: self._as_int_index()._split(splits)} + return cudf.Index._from_column( + self._as_int_index()._split(splits), name=self.name ) def _binaryop(self, other, op: str): @@ -1087,10 +1087,13 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def _from_column( cls, column: ColumnBase, *, name: Hashable = None ) -> Self: - ca = cudf.core.column_accessor.ColumnAccessor( - {name: column}, verify=False - ) - return _index_from_data(ca) + if cls is Index: + ca = cudf.core.column_accessor.ColumnAccessor( + {name: column}, verify=False + ) + return _index_from_data(ca) + else: + return super()._from_column(column, name=name) @classmethod @_performance_tracking @@ -1223,8 +1226,8 @@ def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in non_empties): result = _concat_range_index(non_empties) else: - data = concat_columns([o._values for o in non_empties]) - result = Index(data) + data = concat_columns([o._column for o in non_empties]) + result = Index._from_column(data) names = {obj.name for obj in objs} if len(names) == 1: @@ -1491,7 +1494,7 @@ def __repr__(self): def __getitem__(self, index): res = self._get_elements_from_column(index) if isinstance(res, ColumnBase): - res = Index(res, name=self.name) + res = Index._from_column(res, name=self.name) return res @property # type: ignore @@ -1610,8 +1613,8 @@ def _clean_nulls_from_index(self): if isinstance(self, (DatetimeIndex, TimedeltaIndex)) else str(cudf.NA) ) - return cudf.Index( - self._values.astype("str").fillna(fill_value), + return cudf.Index._from_column( + self._column.astype("str").fillna(fill_value), name=self.name, ) @@ -1866,6 +1869,17 @@ def _from_data( result._freq = _validate_freq(freq) return result + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: 
Hashable = None, freq: Any = None + ) -> Self: + if column.dtype.kind != "M": + raise ValueError("column must have a datetime type.") + result = super()._from_column(column, name=name) + result._freq = _validate_freq(freq) + return result + def __getitem__(self, index): value = super().__getitem__(index) if cudf.get_option("mode.pandas_compatible") and isinstance( @@ -1923,8 +1937,8 @@ def strftime(self, date_format: str) -> Index: date_format : str Date format string (e.g. "%Y-%m-%d"). """ - return Index._from_data( - {self.name: self._column.strftime(date_format)} + return Index._from_column( + self._column.strftime(date_format), name=self.name ) @property @@ -1989,7 +2003,9 @@ def to_pydatetime(self) -> np.ndarray: return self.to_pandas().to_pydatetime() def to_julian_date(self) -> Index: - return Index._from_data({self.name: self._column.to_julian_date()}) + return Index._from_column( + self._column.to_julian_date(), name=self.name + ) def to_period(self, freq) -> pd.PeriodIndex: return self.to_pandas().to_period(freq=freq) @@ -2000,7 +2016,9 @@ def normalize(self) -> Self: Currently not implemented. """ - return type(self)._from_data({self.name: self._column.normalize()}) + return type(self)._from_column( + self._column.normalize(), name=self.name + ) @property def time(self) -> np.ndarray: @@ -2084,7 +2102,7 @@ def days_in_month(self) -> Index: """ Get the total number of days in the month that the date falls on. """ - return Index._from_data({self.name: self._column.days_in_month}) + return Index._from_column(self._column.days_in_month, name=self.name) daysinmonth = days_in_month @@ -2093,7 +2111,7 @@ def day_of_week(self) -> Index: """ Get the day of week that the date falls on. """ - return Index._from_data({self.name: self._column.day_of_week}) + return Index._from_column(self._column.day_of_week, name=self.name) @property # type: ignore @_performance_tracking @@ -2234,15 +2252,15 @@ def microsecond(self): >>> datetime_index.microsecond Index([0, 1, 2], dtype='int32') """ # noqa: E501 - return Index( + return Index._from_column( ( # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. - self._values.get_dt_field("millisecond").astype("int32") + self._column.get_dt_field("millisecond").astype("int32") * cudf.Scalar(1000, dtype="int32") ) - + self._values.get_dt_field("microsecond"), + + self._column.get_dt_field("microsecond"), name=self.name, ) @@ -2374,7 +2392,7 @@ def is_leap_year(self) -> cupy.ndarray: ndarray Booleans indicating if dates belong to a leap year. 
""" - res = is_leap_year(self._values).fillna(False) + res = self._column.is_leap_year.fillna(False) return cupy.asarray(res) @property # type: ignore @@ -2400,8 +2418,7 @@ def quarter(self): >>> gIndex.quarter Index([2, 4], dtype='int8') """ - res = extract_quarter(self._values) - return Index(res, dtype="int8") + return Index._from_column(self._column.quarter.astype("int8")) @_performance_tracking def day_name(self, locale: str | None = None) -> Index: @@ -2423,7 +2440,7 @@ def day_name(self, locale: str | None = None) -> Index: dtype='object') """ day_names = self._column.get_day_names(locale) - return Index._from_data({self.name: day_names}) + return Index._from_column(day_names, name=self.name) @_performance_tracking def month_name(self, locale: str | None = None) -> Index: @@ -2442,7 +2459,7 @@ def month_name(self, locale: str | None = None) -> Index: Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object') """ month_names = self._column.get_month_names(locale) - return Index._from_data({self.name: month_names}) + return Index._from_column(month_names, name=self.name) @_performance_tracking def isocalendar(self) -> cudf.DataFrame: @@ -2481,14 +2498,14 @@ def to_pandas( @_performance_tracking def _get_dt_field(self, field: str) -> Index: """Return an Index of a numerical component of the DatetimeIndex.""" - out_column = self._values.get_dt_field(field) + out_column = self._column.get_dt_field(field) out_column = NumericalColumn( data=out_column.base_data, dtype=out_column.dtype, mask=out_column.base_mask, offset=out_column.offset, ) - return Index(out_column, name=self.name) + return Index._from_column(out_column, name=self.name) def _is_boolean(self): return False @@ -2522,9 +2539,7 @@ def ceil(self, freq): >>> gIndex.ceil("T") DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.ceil(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column(self._column.ceil(freq), name=self.name) @_performance_tracking def floor(self, freq): @@ -2555,9 +2570,9 @@ def floor(self, freq): >>> gIndex.floor("T") DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.floor(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column( + self._column.floor(freq), name=self.name + ) @_performance_tracking def round(self, freq): @@ -2595,9 +2610,9 @@ def round(self, freq): >>> dt_idx.round('T') DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.round(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column( + self._column.round(freq), name=self.name + ) def tz_localize( self, @@ -2647,8 +2662,8 @@ def tz_localize( to 'NaT'. 
""" # noqa: E501 result_col = self._column.tz_localize(tz, ambiguous, nonexistent) - return DatetimeIndex._from_data( - {self.name: result_col}, freq=self._freq + return DatetimeIndex._from_column( + result_col, name=self.name, freq=self._freq ) def tz_convert(self, tz: str | None): @@ -2684,7 +2699,7 @@ def tz_convert(self, tz: str | None): dtype='datetime64[ns, Europe/London]') """ # noqa: E501 result_col = self._column.tz_convert(tz) - return DatetimeIndex._from_data({self.name: result_col}) + return DatetimeIndex._from_column(result_col, name=self.name) def repeat(self, repeats, axis=None): res = super().repeat(repeats, axis=axis) @@ -2794,6 +2809,15 @@ def __init__( super().__init__(data, name=name) + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if column.dtype.kind != "m": + raise ValueError("column must have a timedelta type.") + return super()._from_column(column, name=name) + def __getitem__(self, index): value = super().__getitem__(index) if cudf.get_option("mode.pandas_compatible") and isinstance( @@ -2876,7 +2900,7 @@ def ceil(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.ceil(freq)}) + return type(self)._from_column(self._column.ceil(freq), name=self.name) def floor(self, freq: str) -> Self: """ @@ -2884,7 +2908,9 @@ def floor(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.floor(freq)}) + return type(self)._from_column( + self._column.floor(freq), name=self.name + ) def round(self, freq: str) -> Self: """ @@ -2892,41 +2918,51 @@ def round(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.round(freq)}) + return type(self)._from_column( + self._column.round(freq), name=self.name + ) @property # type: ignore @_performance_tracking - def days(self): + def days(self) -> cudf.Index: """ Number of days for each element. """ # Need to specifically return `int64` to avoid overflow. - return Index(self._values.days, name=self.name, dtype="int64") + return Index._from_column( + self._column.days.astype("int64"), name=self.name + ) @property # type: ignore @_performance_tracking - def seconds(self): + def seconds(self) -> cudf.Index: """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return Index(self._values.seconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.seconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking - def microseconds(self): + def microseconds(self) -> cudf.Index: """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return Index(self._values.microseconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.microseconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking - def nanoseconds(self): + def nanoseconds(self) -> cudf.Index: """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. 
""" - return Index(self._values.nanoseconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.nanoseconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking @@ -3061,17 +3097,26 @@ def __init__( data = data.as_ordered(ordered=False) super().__init__(data, name=name) + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if not isinstance(column.dtype, cudf.CategoricalDtype): + raise ValueError("column must have a categorial type.") + return super()._from_column(column, name=name) + @property def ordered(self) -> bool: return self._column.ordered @property # type: ignore @_performance_tracking - def codes(self): + def codes(self) -> cudf.Index: """ The category codes of this categorical. """ - return Index(self._values.codes) + return Index._from_column(self._column.codes) @property # type: ignore @_performance_tracking @@ -3094,24 +3139,24 @@ def add_categories(self, new_categories) -> Self: `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. """ - return type(self)._from_data( - {self.name: self._column.add_categories(new_categories)} + return type(self)._from_column( + self._column.add_categories(new_categories), name=self.name ) def as_ordered(self) -> Self: """ Set the Categorical to be ordered. """ - return type(self)._from_data( - {self.name: self._column.as_ordered(ordered=True)} + return type(self)._from_column( + self._column.as_ordered(ordered=True), name=self.name ) def as_unordered(self) -> Self: """ Set the Categorical to be unordered. """ - return type(self)._from_data( - {self.name: self._column.as_ordered(ordered=False)} + return type(self)._from_column( + self._column.as_ordered(ordered=False), name=self.name ) def remove_categories(self, removals) -> Self: @@ -3125,8 +3170,8 @@ def remove_categories(self, removals) -> Self: removals : category or list of categories The categories which should be removed. """ - return type(self)._from_data( - {self.name: self._column.remove_categories(removals)} + return type(self)._from_column( + self._column.remove_categories(removals), name=self.name ) def remove_unused_categories(self) -> Self: @@ -3135,8 +3180,8 @@ def remove_unused_categories(self) -> Self: This method is currently not supported. """ - return type(self)._from_data( - {self.name: self._column.remove_unused_categories()} + return type(self)._from_column( + self._column.remove_unused_categories(), name=self.name ) def rename_categories(self, new_categories) -> Self: @@ -3145,8 +3190,8 @@ def rename_categories(self, new_categories) -> Self: This method is currently not supported. """ - return type(self)._from_data( - {self.name: self._column.rename_categories(new_categories)} + return type(self)._from_column( + self._column.rename_categories(new_categories), name=self.name ) def reorder_categories(self, new_categories, ordered=None) -> Self: @@ -3164,12 +3209,9 @@ def reorder_categories(self, new_categories, ordered=None) -> Self: Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. 
""" - return type(self)._from_data( - { - self.name: self._column.reorder_categories( - new_categories, ordered=ordered - ) - } + return type(self)._from_column( + self._column.reorder_categories(new_categories, ordered=ordered), + name=self.name, ) def set_categories( @@ -3191,12 +3233,11 @@ def set_categories( considered as a rename of the old categories or as reordered categories. """ - return type(self)._from_data( - { - self.name: self._column.set_categories( - new_categories, ordered=ordered, rename=rename - ) - } + return type(self)._from_column( + self._column.set_categories( + new_categories, ordered=ordered, rename=rename + ), + name=self.name, ) @@ -3411,6 +3452,15 @@ def __init__( def closed(self): return self.dtype.closed + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if not isinstance(column.dtype, cudf.IntervalDtype): + raise ValueError("column must have a interval type.") + return super()._from_column(column, name=name) + @classmethod @_performance_tracking def from_breaks( @@ -3593,8 +3643,8 @@ def set_closed( Whether the intervals are closed on the left-side, right-side, both or neither. """ - return type(self)._from_data( - {self.name: self._column.set_closed(closed)} + return type(self)._from_column( + self._column.set_closed(closed), name=self.name ) def to_tuples(self, na_tuple: bool = True) -> pd.Index: @@ -3680,15 +3730,7 @@ def as_index( elif isinstance(arbitrary, BaseIndex): idx = arbitrary.copy(deep=copy).rename(name) elif isinstance(arbitrary, ColumnBase): - idx = _index_from_data({name: arbitrary}) - elif isinstance(arbitrary, cudf.Series): - return as_index( - arbitrary._column, - nan_as_null=nan_as_null, - copy=copy, - name=name, - dtype=dtype, - ) + raise ValueError("Use cudf.Index._from_column instead.") elif isinstance(arbitrary, (pd.RangeIndex, range)): idx = RangeIndex( start=arbitrary.start, @@ -3708,11 +3750,9 @@ def as_index( elif isinstance(arbitrary, cudf.DataFrame) or is_scalar(arbitrary): raise ValueError("Index data must be 1-dimensional and list-like") else: - return as_index( + return Index._from_column( column.as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null), - copy=copy, name=name, - dtype=dtype, ) if dtype is not None: idx = idx.astype(dtype) @@ -3749,7 +3789,9 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: elif step is None: # First non-empty index had only one element if obj.start == start: - result = Index(concat_columns([x._values for x in indexes])) + result = Index._from_column( + concat_columns([x._column for x in indexes]) + ) return result step = obj.start - start @@ -3757,7 +3799,9 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: next_ is not None and obj.start != next_ ) if non_consecutive: - result = Index(concat_columns([x._values for x in indexes])) + result = Index._from_column( + concat_columns([x._column for x in indexes]) + ) return result if step is not None: next_ = obj[-1] + step diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3b44a0f5864..8be9f0ad78e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -182,11 +182,16 @@ def _indices_from_labels(obj, labels): ) else: labels = labels.astype(obj.index.dtype) + idx_labels = cudf.Index._from_column(labels) + else: + idx_labels = labels # join is not guaranteed to maintain the index ordering # so we will sort it with its initial 
ordering which is stored # in column "__" - lhs = cudf.DataFrame({"__": as_column(range(len(labels)))}, index=labels) + lhs = cudf.DataFrame( + {"__": as_column(range(len(idx_labels)))}, index=idx_labels + ) rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index) return lhs.join(rhs).sort_values(by=["__", "_"])["_"] @@ -6642,7 +6647,11 @@ def _drop_rows_by_labels( # 3. Use "leftanti" join to drop # TODO: use internal API with "leftanti" and specify left and right # join keys to bypass logic check - to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) + if isinstance(labels, ColumnBase): + join_index = cudf.Index._from_column(labels, name=level) + else: + join_index = cudf.Index(labels, name=level) + to_join = cudf.DataFrame(index=join_index) join_res = working_df.join(to_join, how="leftanti") # 4. Reconstruct original layout, and rename @@ -6669,12 +6678,11 @@ def _drop_rows_by_labels( if errors == "raise" and not labels.isin(obj.index).all(): raise KeyError("One or more values not found in axis") - key_df = cudf.DataFrame._from_data( - data={}, - index=cudf.Index( - labels, name=getattr(labels, "name", obj.index.name) - ), - ) + if isinstance(labels, ColumnBase): + idx = cudf.Index._from_column(labels, name=obj.index.name) + else: + idx = cudf.Index(labels, name=labels.name) + key_df = cudf.DataFrame._from_data(data={}, index=idx) if isinstance(obj, cudf.DataFrame): res = obj.join(key_df, how="leftanti") else: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ab88b191570..a66e2936e3b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -811,8 +811,9 @@ def _index_and_downcast(self, result, index, index_key): # it into an Index and name the final index values according # to that column's name. 
*_, last_column = index._data.columns - out_index = cudf.Index(last_column) - out_index.name = index.names[-1] + out_index = cudf.Index._from_column( + last_column, name=index.names[-1] + ) index = out_index elif out_index._num_columns > 1: # Otherwise pop the leftmost levels, names, and codes from the @@ -1061,7 +1062,7 @@ def get_level_values(self, level): raise KeyError(f"Level not found: '{level}'") else: level_idx = colnames.index(level) - level_values = cudf.Index( + level_values = cudf.Index._from_column( self._data[level], name=self.names[level_idx] ) return level_values diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 715bbf89b15..e0aee28bfeb 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -145,7 +145,9 @@ def copy(self, deep=True): def keys(self): index = super().keys if self._freq is not None and isinstance(index, cudf.DatetimeIndex): - return cudf.DatetimeIndex._from_data(index._data, freq=self._freq) + return cudf.DatetimeIndex._from_column( + index._column, name=index.name, freq=self._freq + ) return index def serialize(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 822b966364f..2fb4fde6552 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3245,8 +3245,8 @@ def value_counts( interval_col = IntervalColumn.from_struct_column( res.index._column._get_decategorized_column() ) - res.index = cudf.IntervalIndex._from_data( - {res.index.name: interval_col} + res.index = cudf.IntervalIndex._from_column( + interval_col, name=res.index.name ) res.name = result_name return res diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index c50a36b68b5..a92bf420147 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -18,7 +18,6 @@ ) from cudf.api.types import is_integer, is_scalar from cudf.core import column -from cudf.core.column_accessor import ColumnAccessor from cudf.core.index import ensure_index # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 @@ -288,8 +287,7 @@ def to_datetime( utc=utc, ) if isinstance(arg, (cudf.BaseIndex, pd.Index)): - ca = ColumnAccessor({arg.name: col}, verify=False) - return cudf.DatetimeIndex._from_data(ca) + return cudf.DatetimeIndex._from_column(col, name=arg.name) elif isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series._from_column( col, name=arg.name, index=ensure_index(arg.index) @@ -297,7 +295,7 @@ def to_datetime( elif is_scalar(arg): return col.element_indexing(0) else: - return cudf.Index(col) + return cudf.Index._from_column(col) except Exception as e: if errors == "raise": raise e @@ -900,7 +898,9 @@ def date_range( end = cudf.Scalar(end, dtype=dtype).value.astype("int64") arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") - return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) + return cudf.DatetimeIndex._from_column(result, name=name).tz_localize( + tz + ) # The code logic below assumes `freq` is defined. It is first normalized # into `DateOffset` for further computation with timestamps. 
@@ -1001,9 +1001,9 @@ def date_range( "datetime64[ns]" ) - return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( - tz - ) + return cudf.DatetimeIndex._from_column( + res, name=name, freq=freq + ).tz_localize(tz) def _has_fixed_frequency(freq: DateOffset) -> bool: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index c2072d90e98..31ad24a4664 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -398,8 +398,12 @@ def assert_index_equal( ) for level in range(left.nlevels): - llevel = cudf.Index(left._columns[level], name=left.names[level]) - rlevel = cudf.Index(right._columns[level], name=right.names[level]) + llevel = cudf.Index._from_column( + left._columns[level], name=left.names[level] + ) + rlevel = cudf.Index._from_column( + right._columns[level], name=right.names[level] + ) mul_obj = f"MultiIndex level [{level}]" assert_index_equal( llevel, diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a68f4574da3..b1e095e8853 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -167,7 +167,9 @@ def test_string_index(): pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = cudf.Index(as_column(["a", "b", "c", "d", "e"]), name="name") + stringIndex = cudf.Index._from_column( + as_column(["a", "b", "c", "d", "e"]), name="name" + ) pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 30880f074c0..cc88cc79769 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1092,7 +1092,7 @@ def test_string_index(): pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = cudf.Index( + stringIndex = cudf.Index._from_column( cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name" ) pdf.index = stringIndex.to_pandas() From 89863a3b791250a2285b90d2c13f51f009638f44 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:22:31 -1000 Subject: [PATCH 063/270] Align public utility function signatures with pandas 2.x (#16565) The following function signatures have a breaking change * `concat` * `get_dummies` * `date_range` Additionally deprecates the `cat` argument in `get_dummies` (doesn't exist in pandas and not tested), and fixes a bug in `interval_range` where `names` was not being respected Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16565 --- python/cudf/cudf/__init__.py | 2 + python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/reshape.py | 74 ++++++++++++++----- python/cudf/cudf/core/tools/datetimes.py | 12 +-- python/cudf/cudf/core/tools/numeric.py | 9 ++- .../cudf/cudf/tests/indexes/test_interval.py | 6 ++ python/cudf/cudf/tests/test_onehot.py | 6 ++ 7 files changed, 84 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index e14815a1b0d..77ae0791b81 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -97,6 +97,7 @@ "DatetimeIndex", "Decimal32Dtype", "Decimal64Dtype", + "Decimal128Dtype", "Grouper", "Index", "IntervalDtype", @@ -126,6 +127,7 @@ "isclose", "melt", "merge", + 
"option_context", "pivot", "pivot_table", "read_avro", diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c55f86d48e1..d02633a97fa 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3350,14 +3350,14 @@ def interval_range( if len(right_col) == 0 or len(left_col) == 0: dtype = IntervalDtype("int64", closed) data = column.column_empty_like_same_mask(left_col, dtype) - return IntervalIndex(data, closed=closed) + return IntervalIndex(data, closed=closed, name=name) interval_col = IntervalColumn( dtype=IntervalDtype(left_col.dtype, closed), size=len(left_col), children=(left_col, right_col), ) - return IntervalIndex(interval_col, closed=closed) + return IntervalIndex(interval_col, closed=closed, name=name) class IntervalIndex(Index): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 52a55760d4a..df471692702 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -118,7 +118,17 @@ def _normalize_series_and_dataframe(objs, axis): objs[idx] = obj.to_frame(name=name) -def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): +def concat( + objs, + axis=0, + join="outer", + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + sort=None, +): """Concatenate DataFrames, Series, or Indices row-wise. Parameters @@ -132,6 +142,21 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ignore_index : bool, default False Set True to ignore the index of the *objs* and provide a default range index instead. + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level. + Currently not supported. + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys. + Currently not supported. + names : list, default None + Names for the levels in the resulting hierarchical index. + Currently not supported. + verify_integrity : bool, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation. + Currently not supported. sort : bool, default False Sort non-concatenation axis if it is not already aligned. @@ -243,6 +268,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): 0 a 1 c 3 1 b 2 d 4 """ + if keys is not None: + raise NotImplementedError("keys is currently not supported") + if levels is not None: + raise NotImplementedError("levels is currently not supported") + if names is not None: + raise NotImplementedError("names is currently not supported") # TODO: Do we really need to have different error messages for an empty # list and a list of None? 
if not objs: @@ -260,7 +291,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): f"Can only concatenate dictionary input along axis=1, not {axis}" ) objs = {k: obj for k, obj in objs.items() if obj is not None} - keys = list(objs) + keys_objs = list(objs) objs = list(objs.values()) if any(isinstance(o, cudf.BaseIndex) for o in objs): raise TypeError( @@ -268,7 +299,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ) else: objs = [obj for obj in objs if obj is not None] - keys = None + keys_objs = None if not objs: raise ValueError("All objects passed were None") @@ -317,8 +348,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): result = obj.to_frame() else: result = obj.copy(deep=True) - if keys is not None and isinstance(result, cudf.DataFrame): - k = keys[0] + if keys_objs is not None and isinstance(result, cudf.DataFrame): + k = keys_objs[0] result.columns = cudf.MultiIndex.from_tuples( [ (k, *c) if isinstance(c, tuple) else (k, c) @@ -370,7 +401,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): objs = _align_objs(objs, how=join, sort=sort) df.index = objs[0].index - if keys is None: + if keys_objs is None: for o in objs: for name, col in o._data.items(): if name in df._data: @@ -408,9 +439,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): "label types in cuDF at this time. You must convert " "the labels to the same type." ) - for k, o in zip(keys, objs): + for k, o in zip(keys_objs, objs): for name, col in o._data.items(): - # if only series, then only keep keys as column labels + # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated if only_series: @@ -426,7 +457,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): else: df[col_label] = col - if keys is None: + if keys_objs is None: df.columns = result_columns.unique() if ignore_index: df.columns = cudf.RangeIndex(len(result_columns.unique())) @@ -666,7 +697,7 @@ def _tile(A, reps): def get_dummies( - df, + data, prefix=None, prefix_sep="_", dummy_na=False, @@ -681,7 +712,7 @@ def get_dummies( Parameters ---------- - df : array-like, Series, or DataFrame + data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, dict, or sequence, optional Prefix to append. 
Either a str (to apply a constant prefix), dict @@ -759,17 +790,22 @@ def get_dummies( if cats is None: cats = {} + else: + warnings.warn( + "cats is deprecated and will be removed in a future version.", + FutureWarning, + ) if sparse: raise NotImplementedError("sparse is not supported yet") if drop_first: raise NotImplementedError("drop_first is not supported yet") - if isinstance(df, cudf.DataFrame): + if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes( + columns = data.select_dtypes( include=encode_fallback_dtypes )._column_names @@ -796,33 +832,33 @@ def get_dummies( # If we have no columns to encode, we need to drop # fallback columns(if any) if len(columns) == 0: - return df.select_dtypes(exclude=encode_fallback_dtypes) + return data.select_dtypes(exclude=encode_fallback_dtypes) else: result_data = { col_name: col - for col_name, col in df._data.items() + for col_name, col in data._data.items() if col_name not in columns } for name in columns: if name not in cats: unique = _get_unique( - column=df._data[name], dummy_na=dummy_na + column=data._data[name], dummy_na=dummy_na ) else: unique = as_column(cats[name]) col_enc_data = _one_hot_encode_column( - column=df._data[name], + column=data._data[name], categories=unique, prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, ) result_data.update(col_enc_data) - return cudf.DataFrame._from_data(result_data, index=df.index) + return cudf.DataFrame._from_data(result_data, index=data.index) else: - ser = cudf.Series(df) + ser = cudf.Series(data) unique = _get_unique(column=ser._column, dummy_na=dummy_na) data = _one_hot_encode_column( column=ser._column, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index a92bf420147..7197560b5a4 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -785,7 +785,7 @@ def date_range( tz=None, normalize: bool = False, name=None, - closed: Literal["left", "right", "both", "neither"] = "both", + inclusive: Literal["left", "right", "both", "neither"] = "both", *, unit: str | None = None, ): @@ -823,7 +823,7 @@ def date_range( name : str, default None Name of the resulting DatetimeIndex - closed : {"left", "right", "both", "neither"}, default "both" + inclusive : {"left", "right", "both", "neither"}, default "both" Whether to set each bound as closed or open. Currently only "both" is supported @@ -839,7 +839,7 @@ def date_range( ----- Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three must be specified. If `freq` is omitted, the resulting DatetimeIndex will - have periods linearly spaced elements between start and end (closed on both + have periods linearly spaced elements between start and end (inclusive on both sides). cudf supports `freq` specified with either fixed-frequency offset @@ -866,8 +866,8 @@ def date_range( '2026-04-23 08:00:00'], dtype='datetime64[ns]') """ - if closed != "both": - raise NotImplementedError(f"{closed=} is currently unsupported.") + if inclusive != "both": + raise NotImplementedError(f"{inclusive=} is currently unsupported.") if unit is not None: raise NotImplementedError(f"{unit=} is currently unsupported.") if normalize is not False: @@ -961,7 +961,7 @@ def date_range( periods = 0 else: # If end == start, periods == 0 and we return exactly 1 timestamp (start). 
- # Otherwise, since closed="both", we ensure the end point is included. + # Otherwise, since inclusive="both", we ensure the end point is included. periods += 1 # We compute `end_estim` (the estimated upper bound of the date diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 8b95f6f6a04..6cecf3fa170 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -20,7 +20,7 @@ from cudf.core.column import ColumnBase -def to_numeric(arg, errors="raise", downcast=None): +def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): """ Convert argument into numerical types. @@ -48,6 +48,8 @@ def to_numeric(arg, errors="raise", downcast=None): Note that downcast behavior is decoupled from parsing. Errors encountered during downcast is raised regardless of ``errors`` parameter. + dtype_backend : None + Not implemented. Returns ------- @@ -93,7 +95,10 @@ def to_numeric(arg, errors="raise", downcast=None): For example ``[1, 'a']``. A ``TypeError`` will be raised when such input is received, regardless of ``errors`` parameter. """ - + if dtype_backend is not None: + raise NotImplementedError( + "dtype_backend is not currently implemented." + ) if errors not in {"raise", "ignore", "coerce"}: raise ValueError("invalid error value specified") elif errors == "ignore": diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 3b3a9f96543..a567c27f584 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -401,3 +401,9 @@ def test_from_tuples(): result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a") expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a") assert_eq(result, expected) + + +def test_interval_range_name(): + expected = pd.interval_range(start=0, periods=5, freq=2, name="foo") + result = cudf.interval_range(start=0, periods=5, freq=2, name="foo") + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 154e1e19072..cc17dc46e0a 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -155,3 +155,9 @@ def test_get_dummies_array_like_with_nan(): actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_") assert_eq(expected, actual) + + +def test_get_dummies_cats_deprecated(): + df = cudf.DataFrame(range(3)) + with pytest.warns(FutureWarning): + cudf.get_dummies(df, cats={0: [0, 1, 2]}) From 2bcb7ecd2c077b3989ced1b8be8727e1b71f93b1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:24:48 -1000 Subject: [PATCH 064/270] Fix `.replace(Index, Index)` raising a TypeError (#16513) Since `cudf.Index` is list-like, passing this to `.replace` should act like replacing a list of values with a corresponding list of values. 
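A minimal sketch of the intended behavior (mirroring the new test below):

```python
import cudf

s = cudf.Series([1, 2])
# Previously raised a TypeError; now equivalent to s.replace([1], [2]):
s.replace(cudf.Index([1]), cudf.Index([2]))
# 0    2
# 1    2
# dtype: int64
```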
Discovered while working on https://github.com/rapidsai/cuml/pull/6019

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16513
---
 python/cudf/cudf/core/indexed_frame.py | 14 +++++++-------
 python/cudf/cudf/tests/test_replace.py |  6 ++++++
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 8be9f0ad78e..ae7369c80d1 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -6469,7 +6469,7 @@ def _get_replacement_values_for_columns(
         to_replace_columns = {col: [to_replace] for col in columns_dtype_map}
         values_columns = {col: [value] for col in columns_dtype_map}
     elif cudf.api.types.is_list_like(to_replace) or isinstance(
-        to_replace, ColumnBase
+        to_replace, (ColumnBase, BaseIndex)
     ):
         if is_scalar(value):
             to_replace_columns = {col: to_replace for col in columns_dtype_map}
@@ -6483,7 +6483,9 @@
                 )
                 for col in columns_dtype_map
             }
-        elif cudf.api.types.is_list_like(value):
+        elif cudf.api.types.is_list_like(
+            value
+        ) or cudf.utils.dtypes.is_column_like(value):
             if len(to_replace) != len(value):
                 raise ValueError(
                     f"Replacement lists must be "
@@ -6495,9 +6497,6 @@
                 col: to_replace for col in columns_dtype_map
             }
             values_columns = {col: value for col in columns_dtype_map}
-        elif cudf.utils.dtypes.is_column_like(value):
-            to_replace_columns = {col: to_replace for col in columns_dtype_map}
-            values_columns = {col: value for col in columns_dtype_map}
         else:
             raise TypeError(
                 "value argument must be scalar, list-like or Series"
@@ -6592,12 +6591,13 @@
     return all_na_columns, to_replace_columns, values_columns


-def _is_series(obj):
+def _is_series(obj: Any) -> bool:
     """
     Checks if the `obj` is of type `cudf.Series`
     instead of checking for isinstance(obj, cudf.Series)
+    to avoid circular imports.
     """
-    return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None
+    return isinstance(obj, IndexedFrame) and obj.ndim == 1


 @_performance_tracking
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 1973fe6fb41..e5ee0127a74 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -1378,3 +1378,9 @@ def test_fillna_nan_and_null():
     result = ser.fillna(2.2)
     expected = cudf.Series([2.2, 2.2, 1.1])
     assert_eq(result, expected)
+
+
+def test_replace_with_index_objects():
+    result = cudf.Series([1, 2]).replace(cudf.Index([1]), cudf.Index([2]))
+    expected = pd.Series([1, 2]).replace(pd.Index([1]), pd.Index([2]))
+    assert_eq(result, expected)

From ac42bc870a65d807784cae63e25b9e9ca788eb23 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Thu, 15 Aug 2024 09:37:43 -0400
Subject: [PATCH 065/270] Hide all gtest symbols in cudftestutil (#16546)

Hiding the gtest symbols in cudftestutil allows consumers of the library to build with a different version of gtest without issue.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Marcus D. Hanwell (https://github.com/cryos)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16546
---
 cpp/cmake/thirdparty/get_gtest.cmake | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
index 10e6b026d9a..ec8cbd8c568 100644
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -16,9 +16,18 @@ function(find_and_configure_gtest)

   include(${rapids-cmake-dir}/cpm/gtest.cmake)

+  # Mark all the non explicit googletest symbols as hidden. This ensures that libcudftestutil can be
+  # used by consumers with a different shared gtest.
+  set(gtest_hide_internal_symbols ON)
+
   # Find or install GoogleTest
   rapids_cpm_gtest(BUILD_STATIC)

+  # Mark all the explicit googletest symbols as hidden. This ensures that libcudftestutil can be
+  # used by consumers with a different shared gtest.
+  if(TARGET gtest)
+    target_compile_definitions(gtest PUBLIC "$")
+  endif()
 endfunction()

 find_and_configure_gtest()

From f4a9b1c5016e254ebf2de55ac9946af6420ebff5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 11:14:06 -1000
Subject: [PATCH 066/270] Use more idiomatic cudf APIs in dask_cudf meta generation (#16487)

Namely:

* Avoiding `cudf.core` imports by checking public column `.dtype`s
* Using more straightforward cudf APIs to construct meta objects

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16487
---
 python/dask_cudf/dask_cudf/backends.py | 124 ++++++++++++-------------
 1 file changed, 58 insertions(+), 66 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 01bab30190a..82ea2ac033a 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -55,37 +55,31 @@
 @meta_nonempty.register(cudf.BaseIndex)
 @_dask_cudf_performance_tracking
 def _nonempty_index(idx):
-    if isinstance(idx, cudf.core.index.RangeIndex):
-        return cudf.core.index.RangeIndex(2, name=idx.name)
-    elif isinstance(idx, cudf.core.index.DatetimeIndex):
-        start = "1970-01-01"
-        data = np.array([start, "1970-01-02"], dtype=idx.dtype)
+    """Return a non-empty cudf.Index as metadata."""
+    # TODO: IntervalIndex, TimedeltaIndex?
+ if isinstance(idx, cudf.RangeIndex): + return cudf.RangeIndex(2, name=idx.name) + elif isinstance(idx, cudf.DatetimeIndex): + data = np.array(["1970-01-01", "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) - return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.CategoricalIndex): - key = tuple(idx._data.keys()) - assert len(key) == 1 - categories = idx._data[key[0]].categories - codes = [0, 0] - ordered = idx._data[key[0]].ordered + return cudf.DatetimeIndex(values, name=idx.name) + elif isinstance(idx, cudf.CategoricalIndex): values = cudf.core.column.build_categorical_column( - categories=categories, codes=codes, ordered=ordered + categories=idx.categories, codes=[0, 0], ordered=idx.ordered ) - return cudf.core.index.CategoricalIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.multiindex.MultiIndex): + return cudf.CategoricalIndex(values, name=idx.name) + elif isinstance(idx, cudf.MultiIndex): levels = [meta_nonempty(lev) for lev in idx.levels] - codes = [[0, 0] for i in idx.levels] - return cudf.core.multiindex.MultiIndex( - levels=levels, codes=codes, names=idx.names - ) - elif isinstance(idx._column, cudf.core.column.StringColumn): + codes = [[0, 0]] * idx.nlevels + return cudf.MultiIndex(levels=levels, codes=codes, names=idx.names) + elif is_string_dtype(idx.dtype): return cudf.Index(["cat", "dog"], name=idx.name) - elif isinstance(idx, cudf.core.index.Index): - return cudf.core.index.Index( - np.arange(2, dtype=idx.dtype), name=idx.name - ) + elif isinstance(idx, cudf.Index): + return cudf.Index(np.arange(2, dtype=idx.dtype), name=idx.name) - raise TypeError(f"Don't know how to handle index of type {type(idx)}") + raise TypeError( + f"Don't know how to handle index of type {type(idx).__name__}" + ) def _nest_list_data(data, leaf_type): @@ -101,50 +95,49 @@ def _nest_list_data(data, leaf_type): @_dask_cudf_performance_tracking -def _get_non_empty_data(s): - """Return a non empty column as metadata.""" - if isinstance(s, cudf.core.column.CategoricalColumn): +def _get_non_empty_data( + s: cudf.core.column.ColumnBase, +) -> cudf.core.column.ColumnBase: + """Return a non-empty column as metadata from a column.""" + if isinstance(s.dtype, cudf.CategoricalDtype): categories = ( - s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] + s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] # type: ignore[attr-defined] ) codes = cudf.core.column.as_column( 0, dtype=cudf._lib.types.size_type_dtype, length=2, ) - ordered = s.ordered - data = cudf.core.column.build_categorical_column( + ordered = s.ordered # type: ignore[attr-defined] + return cudf.core.column.build_categorical_column( categories=categories, codes=codes, ordered=ordered ) - elif isinstance(s, cudf.core.column.ListColumn): + elif isinstance(s.dtype, cudf.ListDtype): leaf_type = s.dtype.leaf_type if is_string_dtype(leaf_type): data = ["cat", "dog"] else: data = np.array([0, 1], dtype=leaf_type).tolist() data = _nest_list_data(data, s.dtype) * 2 - data = cudf.core.column.as_column(data, dtype=s.dtype) - elif isinstance(s, cudf.core.column.StructColumn): + return cudf.core.column.as_column(data, dtype=s.dtype) + elif isinstance(s.dtype, cudf.StructDtype): + # Handles IntervalColumn struct_dtype = s.dtype - data = [{key: None for key in struct_dtype.fields.keys()}] * 2 - data = cudf.core.column.as_column(data, dtype=s.dtype) + struct_data = [{key: None for key in struct_dtype.fields.keys()}] * 2 + return 
cudf.core.column.as_column(struct_data, dtype=s.dtype) elif is_string_dtype(s.dtype): - data = cudf.core.column.as_column(pa.array(["cat", "dog"])) + return cudf.core.column.as_column(pa.array(["cat", "dog"])) elif isinstance(s.dtype, pd.DatetimeTZDtype): - from cudf.utils.dtypes import get_time_unit - - data = cudf.date_range("2001-01-01", periods=2, freq=get_time_unit(s)) - data = data.tz_localize(str(s.dtype.tz))._column + date_data = cudf.date_range("2001-01-01", periods=2, freq=s.time_unit) # type: ignore[attr-defined] + return date_data.tz_localize(str(s.dtype.tz))._column + elif s.dtype.kind in "fiubmM": + return cudf.core.column.as_column( + np.arange(start=0, stop=2, dtype=s.dtype) + ) else: - if pd.api.types.is_numeric_dtype(s.dtype): - data = cudf.core.column.as_column( - cp.arange(start=0, stop=2, dtype=s.dtype) - ) - else: - data = cudf.core.column.as_column( - cp.arange(start=0, stop=2, dtype="int64") - ).astype(s.dtype) - return data + raise TypeError( + f"Don't know how to handle column of type {type(s).__name__}" + ) @meta_nonempty.register(cudf.Series) @@ -162,24 +155,25 @@ def _nonempty_series(s, idx=None): def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() - res = cudf.DataFrame(index=idx) - for col in x._data.names: - dtype = str(x._data[col].dtype) - if dtype in ("list", "struct", "category"): + res = {} + for col_label, col in x._data.items(): + dtype = col.dtype + if isinstance( + dtype, + (cudf.ListDtype, cudf.StructDtype, cudf.CategoricalDtype), + ): # 1. Not possible to hash and store list & struct types # as they can contain different levels of nesting or # fields. - # 2. Not possible to has `category` types as + # 2. Not possible to hash `category` types as # they often contain an underlying types to them. 
- res._data[col] = _get_non_empty_data(x._data[col]) + res[col_label] = _get_non_empty_data(col) else: if dtype not in columns_with_dtype: - columns_with_dtype[dtype] = cudf.core.column.as_column( - _get_non_empty_data(x._data[col]) - ) - res._data[col] = columns_with_dtype[dtype] + columns_with_dtype[dtype] = _get_non_empty_data(col) + res[col_label] = columns_with_dtype[dtype] - return res + return cudf.DataFrame._from_data(res, index=idx) @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) @@ -197,9 +191,7 @@ def make_meta_cudf_index(x, index=None): @_dask_cudf_performance_tracking def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": - return cudf.Series( - [UNKNOWN_CATEGORIES], dtype=dtype, name=name, index=index - ).iloc[:0] + dtype = cudf.CategoricalDtype(categories=[UNKNOWN_CATEGORIES]) return cudf.Series([], dtype=dtype, name=name, index=index) @@ -337,7 +329,7 @@ def percentile_cudf(a, q, interpolation="linear"): if isinstance(q, Iterator): q = list(q) - if cudf.api.types._is_categorical_dtype(a.dtype): + if isinstance(a.dtype, cudf.CategoricalDtype): result = cp.percentile(a.cat.codes, q, interpolation=interpolation) return ( @@ -346,7 +338,7 @@ def percentile_cudf(a, q, interpolation="linear"): ), n, ) - if np.issubdtype(a.dtype, np.datetime64): + if a.dtype.kind == "M": result = a.quantile( [i / 100.0 for i in q], interpolation=interpolation ) From 1e220b708582c73d128c53f3279d4588167a310f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:58:45 -1000 Subject: [PATCH 067/270] Return Interval object in pandas compat mode for IntervalIndex reductions (#16523) xref https://github.com/rapidsai/cudf/issues/16507 In non pandas compat mode, I think this still makes sense to return a `dict` since that's the "scalar" type of a cudf struct/interval type, but in pandas compat mode we should match pandas and return an Interval. 
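A short sketch of the two modes (the numeric intervals here are illustrative; the new test below exercises datetime intervals):

```python
import pandas as pd

import cudf

ii = cudf.IntervalIndex.from_pandas(
    pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])
)
ii.min()  # default mode: a dict such as {"left": 0, "right": 1}
with cudf.option_context("mode.pandas_compatible", True):
    ii.min()  # pandas compat mode: pd.Interval(0, 1, closed="right")
```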
Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16523
---
 python/cudf/cudf/_lib/reduce.pyx         |  6 +++++-
 python/cudf/cudf/core/column/interval.py | 14 ++++++++++++++
 python/cudf/cudf/tests/test_interval.py  | 11 +++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 64634b7a6f9..511bba20ef5 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -61,7 +61,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
                 result,
                 dtype=col_dtype.__class__(precision, scale),
             ).value
-    return DeviceScalar.from_pylibcudf(result).value
+    scalar = DeviceScalar.from_pylibcudf(result).value
+    if isinstance(col_dtype, cudf.StructDtype):
+        # TODO: Utilize column_metadata in libcudf to maintain field labels
+        return dict(zip(col_dtype.fields.keys(), scalar.values()))
+    return scalar


 @acquire_spill_lock()
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index b2f79ef0c65..d9fc96a9f3e 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -11,6 +11,7 @@
 from cudf.core.dtypes import IntervalDtype

 if TYPE_CHECKING:
+    from cudf._typing import ScalarLike
     from cudf.core.column import ColumnBase


@@ -186,3 +187,16 @@ def element_indexing(self, index: int):
         if cudf.get_option("mode.pandas_compatible"):
             return pd.Interval(**result, closed=self.dtype.closed)
         return result
+
+    def _reduce(
+        self,
+        op: str,
+        skipna: bool | None = None,
+        min_count: int = 0,
+        *args,
+        **kwargs,
+    ) -> ScalarLike:
+        result = super()._reduce(op, skipna, min_count, *args, **kwargs)
+        if cudf.get_option("mode.pandas_compatible"):
+            return pd.Interval(**result, closed=self.dtype.closed)
+        return result
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 5eeea87d8e0..2d194107658 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -194,3 +194,14 @@ def test_intervaldtype_eq_string_with_attributes():
     dtype = cudf.IntervalDtype("int64", closed="left")
     assert dtype == "interval"
     assert dtype == "interval[int64, left]"
+
+
+def test_reduction_return_interval_pandas_compatible():
+    ii = pd.IntervalIndex.from_tuples(
+        [("2017-01-03", "2017-01-04")], dtype="interval[datetime64[ns], right]"
+    )
+    cudf_ii = cudf.IntervalIndex.from_pandas(ii)
+    with cudf.option_context("mode.pandas_compatible", True):
+        result = cudf_ii.min()
+    expected = ii.min()
+    assert result == expected

From 50841355812685e0e48d1577b8384399cdad5a0f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 13:59:58 -1000
Subject: [PATCH 068/270] Make NumericalColumn.__init__ strict (#16457)

This PR makes `NumericalBaseColumn.__init__` and its subclasses strict, putting restrictions on `data`, `dtype`, `size`, and `children` so these columns cannot be constructed in an invalid state. It also aligns the signature with the base class.
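A rough sketch of the new fail-fast behavior (internal API; the error text matches this diff, the construction itself is illustrative):

```python
import numpy as np

import cudf
from cudf.core.column.numerical import NumericalColumn

col = cudf.Series([1, 2, 3])._column  # an existing NumericalColumn
# Rebuilding it with a non-numeric dtype is now rejected up front:
NumericalColumn(col.base_data, size=len(col), dtype=np.dtype("M8[ns]"))
# ValueError: dtype must be a floating, integer or boolean numpy dtype.
```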
xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16457 --- python/cudf/cudf/_lib/column.pyx | 2 + python/cudf/cudf/core/column/column.py | 22 ++--- python/cudf/cudf/core/column/decimal.py | 92 ++++++++++++++++++- python/cudf/cudf/core/column/numerical.py | 13 ++- .../cudf/cudf/core/column/numerical_base.py | 29 +++++- 5 files changed, 134 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e030147fdd3..f0c07dfbc1b 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -88,6 +88,8 @@ cdef class Column: object null_count=None, object children=() ): + if size < 0: + raise ValueError("size must be >=0") self._size = size self._distinct_count = {} self._dtype = dtype diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a7d2cb441dd..9785c3e5517 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1652,23 +1652,19 @@ def build_column( null_count=null_count, ) elif isinstance(dtype, StructDtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.StructColumn( data=data, dtype=dtype, - size=size, + size=size, # type: ignore[arg-type] offset=offset, mask=mask, null_count=null_count, children=children, ) elif isinstance(dtype, cudf.Decimal64Dtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.Decimal64Column( - data=data, - size=size, + data=data, # type: ignore[arg-type] + size=size, # type: ignore[arg-type] offset=offset, dtype=dtype, mask=mask, @@ -1676,11 +1672,9 @@ def build_column( children=children, ) elif isinstance(dtype, cudf.Decimal32Dtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.Decimal32Column( - data=data, - size=size, + data=data, # type: ignore[arg-type] + size=size, # type: ignore[arg-type] offset=offset, dtype=dtype, mask=mask, @@ -1688,11 +1682,9 @@ def build_column( children=children, ) elif isinstance(dtype, cudf.Decimal128Dtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.Decimal128Column( - data=data, - size=size, + data=data, # type: ignore[arg-type] + size=size, # type: ignore[arg-type] offset=offset, dtype=dtype, mask=mask, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 6a7f338b065..3b979ef2e97 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -31,14 +31,38 @@ if TYPE_CHECKING: from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike + from cudf.core.buffer import Buffer class DecimalBaseColumn(NumericalBaseColumn): """Base column for decimal32, decimal64 or decimal128 columns""" - dtype: DecimalDtype _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + def __init__( + self, + data: Buffer, + size: int, + dtype: DecimalDtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(size, int): + raise ValueError("Must specify an integer size") + if not isinstance(dtype, DecimalDtype): + raise ValueError(f"{dtype=} must be a DecimalDtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + 
null_count=null_count, + children=children, + ) + @property def __cuda_array_interface__(self): raise NotImplementedError( @@ -205,7 +229,27 @@ def as_numerical_column( class Decimal32Column(DecimalBaseColumn): - dtype: Decimal32Dtype + def __init__( + self, + data: Buffer, + size: int, + dtype: Decimal32Dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(dtype, Decimal32Dtype): + raise ValueError(f"{dtype=} must be a Decimal32Dtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) @classmethod def from_arrow(cls, data: pa.Array): @@ -266,7 +310,27 @@ def _with_type_metadata( class Decimal128Column(DecimalBaseColumn): - dtype: Decimal128Dtype + def __init__( + self, + data: Buffer, + size: int, + dtype: Decimal128Dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(dtype, Decimal128Dtype): + raise ValueError(f"{dtype=} must be a Decimal128Dtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) @classmethod def from_arrow(cls, data: pa.Array): @@ -287,7 +351,27 @@ def _with_type_metadata( class Decimal64Column(DecimalBaseColumn): - dtype: Decimal64Dtype + def __init__( + self, + data: Buffer, + size: int, + dtype: Decimal64Dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(dtype, Decimal64Dtype): + raise ValueError(f"{dtype=} must be a Decimal64Dtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) def __setitem__(self, key, value): if isinstance(value, np.integer): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index bbc74ef349e..16e78ef35ef 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -61,25 +61,30 @@ class NumericalColumn(NumericalBaseColumn): def __init__( self, data: Buffer, - dtype: DtypeObj, + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make this non-optional offset: int = 0, null_count: int | None = None, + children: tuple = (), ): - dtype = cudf.dtype(dtype) + if not (isinstance(dtype, np.dtype) and dtype.kind in "iufb"): + raise ValueError( + "dtype must be a floating, integer or boolean numpy dtype." 
+ ) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: size = (data.size // dtype.itemsize) - offset super().__init__( - data, + data=data, size=size, dtype=dtype, mask=mask, offset=offset, null_count=null_count, + children=children, ) def _clear_cache(self): diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index f41010062c8..3b8dd05c13a 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -9,16 +9,19 @@ import cudf from cudf import _lib as libcudf +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable if TYPE_CHECKING: from cudf._typing import ScalarLike + from cudf.core.column.decimal import DecimalDtype class NumericalBaseColumn(ColumnBase, Scannable): - """A column composed of numerical data. + """ + A column composed of numerical (bool, integer, float, decimal) data. This class encodes a standard interface for different types of columns containing numerical types of data. In particular, mathematical operations @@ -42,6 +45,30 @@ class NumericalBaseColumn(ColumnBase, Scannable): "cummax", } + def __init__( + self, + data: Buffer, + size: int, + dtype: DecimalDtype | np.dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer instance.") + if len(children) != 0: + raise ValueError(f"{type(self).__name__} must have no children.") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls() From 155eddedc0e2b68d203cfbc318172396f4293d98 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 14:00:57 -1000 Subject: [PATCH 069/270] Make Timedelta/DatetimeColumn.__init__ strict (#16464) This PR makes Datetime/TimedeltaColumn.__init__ and its subclasses strict putting restrictions on data, dtype, size and children so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class. 
xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16464 --- python/cudf/cudf/core/column/column.py | 12 ++----- python/cudf/cudf/core/column/datetime.py | 43 ++++++++++++++++------- python/cudf/cudf/core/column/timedelta.py | 17 +++++---- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9785c3e5517..b0e33e8b9ce 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1592,10 +1592,8 @@ def build_column( children=children, ) elif dtype.type is np.datetime64: - if data is None: - raise TypeError("Must specify data buffer") return cudf.core.column.DatetimeColumn( - data=data, + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, @@ -1603,10 +1601,8 @@ def build_column( null_count=null_count, ) elif isinstance(dtype, pd.DatetimeTZDtype): - if data is None: - raise TypeError("Must specify data buffer") return cudf.core.column.datetime.DatetimeTZColumn( - data=data, + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, @@ -1614,10 +1610,8 @@ def build_column( null_count=null_count, ) elif dtype.type is np.timedelta64: - if data is None: - raise TypeError("Must specify data buffer") return cudf.core.column.TimeDeltaColumn( - data=data, + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1dbc94384d3..d0ea4612a1b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -24,6 +24,7 @@ get_compatible_timezone, get_tz_data, ) +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype @@ -34,10 +35,8 @@ ColumnBinaryOperand, DatetimeLikeScalar, Dtype, - DtypeObj, ScalarLike, ) - from cudf.core.buffer import Buffer from cudf.core.column.numerical import NumericalColumn if PANDAS_GE_220: @@ -207,30 +206,39 @@ class DatetimeColumn(column.ColumnBase): def __init__( self, data: Buffer, - dtype: DtypeObj, + size: int | None, + dtype: np.dtype | pd.DatetimeTZDtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make non-optional offset: int = 0, null_count: int | None = None, + children: tuple = (), ): - dtype = cudf.dtype(dtype) - if dtype.kind != "M": - raise TypeError(f"{self.dtype} is not a supported datetime type") - + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer.") + dtype = self._validate_dtype_instance(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: size = data.size // dtype.itemsize size = size - offset + if len(children) != 0: + raise ValueError(f"{type(self).__name__} must have no children.") super().__init__( - data, + data=data, size=size, dtype=dtype, mask=mask, offset=offset, null_count=null_count, + children=children, ) + @staticmethod + def _validate_dtype_instance(dtype: np.dtype) -> np.dtype: + if not (isinstance(dtype, np.dtype) and dtype.kind == "M"): + raise ValueError("dtype must be a datetime, numpy dtype") + return dtype + def __contains__(self, item: ScalarLike) -> bool: try: ts = 
pd.Timestamp(item).as_unit(self.time_unit) @@ -858,21 +866,30 @@ class DatetimeTZColumn(DatetimeColumn): def __init__( self, data: Buffer, + size: int | None, dtype: pd.DatetimeTZDtype, mask: Buffer | None = None, - size: int | None = None, offset: int = 0, null_count: int | None = None, + children: tuple = (), ): super().__init__( data=data, - dtype=_get_base_dtype(dtype), - mask=mask, size=size, + dtype=dtype, + mask=mask, offset=offset, null_count=null_count, + children=children, ) - self._dtype = get_compatible_timezone(dtype) + + @staticmethod + def _validate_dtype_instance( + dtype: pd.DatetimeTZDtype, + ) -> pd.DatetimeTZDtype: + if not isinstance(dtype, pd.DatetimeTZDtype): + raise ValueError("dtype must be a pandas.DatetimeTZDtype") + return get_compatible_timezone(dtype) def to_pandas( self, diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index ba0dc4779bb..6b6f3e517a8 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -75,28 +75,33 @@ class TimeDeltaColumn(ColumnBase): def __init__( self, data: Buffer, - dtype: Dtype, - size: int | None = None, # TODO: make non-optional + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, offset: int = 0, null_count: int | None = None, + children: tuple = (), ): - dtype = cudf.dtype(dtype) - if dtype.kind != "m": - raise TypeError(f"{self.dtype} is not a supported duration type") + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer.") + if not (isinstance(dtype, np.dtype) and dtype.kind == "m"): + raise ValueError("dtype must be a timedelta numpy dtype.") if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: size = data.size // dtype.itemsize size = size - offset + if len(children) != 0: + raise ValueError("TimedeltaColumn must have no children.") super().__init__( - data, + data=data, size=size, dtype=dtype, mask=mask, offset=offset, null_count=null_count, + children=children, ) def __contains__(self, item: DatetimeLikeScalar) -> bool: From f955dd76b47779d4f527efe25de417b1acbff4a7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 15 Aug 2024 17:13:58 -0700 Subject: [PATCH 070/270] Rewrite remaining Python Arrow interop conversions using the C Data Interface (#16548) This PR rewrites all remaining parts of the Python interop code previously using Arrow C++ types to instead use the C Data Interface. With this change, we no longer require pyarrow in that part of the Cython code. There are further improvements that we should make to streamline the internals, but I would like to keep this changeset minimal since getting it merged unblocks progress on multiple fronts so that we can progress further in parallel. 
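The public entry points are unchanged; for example, a round trip through pylibcudf (a sketch, assuming a pyarrow recent enough to support the Arrow PyCapsule protocols):

```python
import pyarrow as pa

import cudf._lib.pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 3]}))  # C stream import
pa_tbl = plc.interop.to_arrow(tbl, metadata=["a"])  # exported via __arrow_c_array__
assert pa_tbl.equals(pa.table({"a": [1, 2, 3]}))
```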
Contributes to #15193 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16548 --- cpp/src/interop/arrow_utilities.cpp | 1 + cpp/src/interop/to_arrow_schema.cpp | 5 +- python/cudf/cudf/_lib/CMakeLists.txt | 6 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 5 +- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 188 +++++++++--------- .../cudf/_lib/pylibcudf/libcudf/interop.pxd | 53 +++-- .../cudf/cudf/pylibcudf_tests/common/utils.py | 6 +- 7 files changed, 146 insertions(+), 118 deletions(-) diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index 4292552a800..3776daf41aa 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -98,6 +98,7 @@ ArrowType id_to_arrow_type(cudf::type_id id) ArrowType id_to_arrow_storage_type(cudf::type_id id) { switch (id) { + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_INT32; case cudf::type_id::TIMESTAMP_SECONDS: case cudf::type_id::TIMESTAMP_MILLISECONDS: case cudf::type_id::TIMESTAMP_MICROSECONDS: diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index b98ca8a7bed..5afed772656 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -170,8 +170,9 @@ int dispatch_to_arrow_type::operator()(column_view input, NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST)); auto child = input.child(cudf::lists_column_view::child_column_index); ArrowSchemaInit(out->children[0]); - auto child_meta = - metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0]; + auto child_meta = metadata.children_meta.empty() + ? column_metadata{"element"} + : metadata.children_meta[cudf::lists_column_view::child_column_index]; out->flags = input.has_nulls() ? 
ARROW_FLAG_NULLABLE : 0; NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str())); diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 38b7e9ebe04..d32a2d8e3f8 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -64,9 +64,13 @@ rapids_cython_create_modules( target_link_libraries(strings_udf PUBLIC cudf_strings_udf) -set(targets_using_arrow_headers interop avro csv orc json parquet) +set(targets_using_arrow_headers avro csv orc json parquet) link_to_pyarrow_headers("${targets_using_arrow_headers}") +include(${rapids-cmake-dir}/export/find_package_root.cmake) +include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) +target_link_libraries(interop PUBLIC nanoarrow) + add_subdirectory(io) add_subdirectory(nvtext) add_subdirectory(pylibcudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index df4591baa71..da32d530928 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -52,7 +52,10 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) -link_to_pyarrow_headers(pylibcudf_interop) + +include(${rapids-cmake-dir}/export/find_package_root.cmake) +include(../../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) +target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index adf7e1fd7e8..caa19724786 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,11 +1,10 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from cpython cimport pycapsule -from cython.operator cimport dereference -from libcpp.memory cimport shared_ptr, unique_ptr +from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from libc.stdlib cimport free +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow cimport lib as pa from dataclasses import dataclass, field from functools import singledispatch @@ -18,23 +17,14 @@ from cudf._lib.pylibcudf.libcudf.interop cimport ( ArrowArrayStream, ArrowSchema, column_metadata, - from_arrow as cpp_from_arrow, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, - to_arrow as cpp_to_arrow, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( - fixed_point_scalar, - scalar, + to_arrow_host_raw, + to_arrow_schema_raw, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) +from . cimport copying from .column cimport Column from .scalar cimport Scalar from .table cimport Table @@ -109,7 +99,9 @@ def from_arrow(pyarrow_object, *, DataType data_type=None): Union[Table, Scalar] The converted object of type corresponding to the input type in cudf. 
""" - raise TypeError("from_arrow only accepts Table and Scalar objects") + raise TypeError( + f"Unsupported type {type(pyarrow_object)} for conversion from arrow" + ) @from_arrow.register(pa.DataType) @@ -133,7 +125,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): raise ValueError("data_type may not be passed for tables") stream = pyarrow_object.__arrow_c_stream__() cdef ArrowArrayStream* c_stream = ( - pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream") + PyCapsule_GetPointer(stream, "arrow_array_stream") ) cdef unique_ptr[table] c_result @@ -146,54 +138,17 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): @from_arrow.register(pa.Scalar) def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): - cdef shared_ptr[pa.CScalar] arrow_scalar = pa.pyarrow_unwrap_scalar(pyarrow_object) - - cdef unique_ptr[scalar] c_result - with nogil: - c_result = move(cpp_from_arrow(dereference(arrow_scalar))) - - cdef Scalar result = Scalar.from_libcudf(move(c_result)) - - if result.type().id() != type_id.DECIMAL128: - if data_type is not None: - raise ValueError( - "dtype may not be passed for non-decimal types" - ) - return result - - if data_type is None: - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - cdef type_id tid = data_type.id() - - if tid == type_id.DECIMAL32: - result.c_obj.reset( - new fixed_point_scalar[decimal32]( - ( - result.c_obj.get() - ).value(), - scale_type(-pyarrow_object.type.scale), - result.c_obj.get().is_valid() - ) - ) - elif tid == type_id.DECIMAL64: - result.c_obj.reset( - new fixed_point_scalar[decimal64]( - ( - result.c_obj.get() - ).value(), - scale_type(-pyarrow_object.type.scale), - result.c_obj.get().is_valid() - ) - ) - elif tid != type_id.DECIMAL128: - raise ValueError( - "Decimal scalars may only be cast to decimals" - ) - - return result + if isinstance(pyarrow_object.type, pa.ListType) and pyarrow_object.as_py() is None: + # pyarrow doesn't correctly handle None values for list types, so + # we have to create this one manually. + # https://github.com/apache/arrow/issues/40319 + pa_array = pa.array([None], type=pyarrow_object.type) + else: + pa_array = pa.array([pyarrow_object]) + return copying.get_element( + from_arrow(pa_array, data_type=data_type), + 0, + ) @from_arrow.register(pa.Array) @@ -204,10 +159,10 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None): schema, array = pyarrow_object.__arrow_c_array__() cdef ArrowSchema* c_schema = ( - pycapsule.PyCapsule_GetPointer(schema, "arrow_schema") + PyCapsule_GetPointer(schema, "arrow_schema") ) cdef ArrowArray* c_array = ( - pycapsule.PyCapsule_GetPointer(array, "arrow_array") + PyCapsule_GetPointer(array, "arrow_array") ) cdef unique_ptr[column] c_result @@ -238,7 +193,7 @@ def to_arrow(cudf_object, metadata=None): Union[pyarrow.Array, pyarrow.Table, pyarrow.Scalar] The converted object of type corresponding to the input type in PyArrow. 
""" - raise TypeError("to_arrow only accepts Table and Scalar objects") + raise TypeError(f"Unsupported type {type(cudf_object)} for conversion to arrow") @to_arrow.register(DataType) @@ -281,46 +236,83 @@ def _to_arrow_datatype(cudf_object, **kwargs): ) -@to_arrow.register(Table) -def _to_arrow_table(cudf_object, metadata=None): +cdef void _release_schema(object schema_capsule) noexcept: + """Release the ArrowSchema object stored in a PyCapsule.""" + cdef ArrowSchema* schema = PyCapsule_GetPointer( + schema_capsule, 'arrow_schema' + ) + if schema.release != NULL: + schema.release(schema) + + free(schema) + + +cdef void _release_array(object array_capsule) noexcept: + """Release the ArrowArray object stored in a PyCapsule.""" + cdef ArrowArray* array = PyCapsule_GetPointer( + array_capsule, 'arrow_array' + ) + if array.release != NULL: + array.release(array) + + free(array) + + +def _table_to_schema(Table tbl, metadata): if metadata is None: - metadata = [ColumnMetadata() for _ in range(len(cudf_object.columns()))] + metadata = [ColumnMetadata() for _ in range(len(tbl.columns()))] metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata] - cdef vector[column_metadata] c_table_metadata - cdef shared_ptr[pa.CTable] c_table_result + + cdef vector[column_metadata] c_metadata + c_metadata.reserve(len(metadata)) for meta in metadata: - c_table_metadata.push_back(_metadata_to_libcudf(meta)) + c_metadata.push_back(_metadata_to_libcudf(meta)) + + cdef ArrowSchema* raw_schema_ptr with nogil: - c_table_result = move( - cpp_to_arrow((
cudf_object).view(), c_table_metadata) - ) + raw_schema_ptr = to_arrow_schema_raw(tbl.view(), c_metadata) - return pa.pyarrow_wrap_table(c_table_result) + return PyCapsule_New(raw_schema_ptr, 'arrow_schema', _release_schema) -@to_arrow.register(Scalar) -def _to_arrow_scalar(cudf_object, metadata=None): - # Note that metadata for scalars is primarily important for preserving - # information on nested types since names are otherwise irrelevant. - if metadata is None: - metadata = ColumnMetadata() - metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata - cdef column_metadata c_scalar_metadata = _metadata_to_libcudf(metadata) - cdef shared_ptr[pa.CScalar] c_scalar_result +def _table_to_host_array(Table tbl): + cdef ArrowArray* raw_host_array_ptr with nogil: - c_scalar_result = move( - cpp_to_arrow( - dereference(( cudf_object).c_obj), c_scalar_metadata - ) - ) + raw_host_array_ptr = to_arrow_host_raw(tbl.view()) + + return PyCapsule_New(raw_host_array_ptr, "arrow_array", _release_array) + + +class _TableWithArrowMetadata: + def __init__(self, tbl, metadata=None): + self.tbl = tbl + self.metadata = metadata - return pa.pyarrow_wrap_scalar(c_scalar_result) + def __arrow_c_array__(self, requested_schema=None): + return _table_to_schema(self.tbl, self.metadata), _table_to_host_array(self.tbl) + + +# TODO: In the long run we should get rid of the `to_arrow` functions in favor of using +# the protocols directly via `pa.table(cudf_object, schema=...)` directly. We can do the +# same for columns. We cannot do this for scalars since there is no corresponding +# protocol. Since this will require broader changes throughout the codebase, the current +# approach is to leverage the protocol internally but to continue exposing `to_arrow`. +@to_arrow.register(Table) +def _to_arrow_table(cudf_object, metadata=None): + test_table = _TableWithArrowMetadata(cudf_object, metadata) + return pa.table(test_table) @to_arrow.register(Column) def _to_arrow_array(cudf_object, metadata=None): """Create a PyArrow array from a pylibcudf column.""" - if metadata is None: - metadata = ColumnMetadata() - metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata - return to_arrow(Table([cudf_object]), [metadata])[0] + if metadata is not None: + metadata = [metadata] + return to_arrow(Table([cudf_object]), metadata)[0] + + +@to_arrow.register(Scalar) +def _to_arrow_scalar(cudf_object, metadata=None): + # Note that metadata for scalars is primarily important for preserving + # information on nested types since names are otherwise irrelevant. 
+ return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0] diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd index 2151da28d4b..24d96b602dc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd @@ -3,11 +3,11 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view @@ -29,6 +29,9 @@ cdef extern from "cudf/interop.hpp" nogil: cdef struct ArrowArrayStream: void (*release)(ArrowArrayStream*) noexcept nogil + cdef struct ArrowDeviceArray: + ArrowArray array + cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: @@ -38,27 +41,49 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ DLManagedTensor* to_dlpack(table_view input_table ) except + - cdef unique_ptr[table] from_arrow(CTable input) except + - cdef unique_ptr[scalar] from_arrow(CScalar input) except + - cdef cppclass column_metadata: column_metadata() except + column_metadata(string name_) except + string name vector[column_metadata] children_meta - cdef shared_ptr[CTable] to_arrow( - table_view input, - vector[column_metadata] metadata, - ) except + - - cdef shared_ptr[CScalar] to_arrow( - const scalar& input, - column_metadata metadata, - ) except + - cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except + cdef unique_ptr[column] from_arrow_column( const ArrowSchema* schema, const ArrowArray* input ) except + + + +cdef extern from *: + # Rather than exporting the underlying functions directly to Cython, we expose + # these wrappers that handle the release to avoid needing to teach Cython how + # to handle unique_ptrs with custom deleters that aren't default constructible. + # This will go away once we introduce cudf::arrow_column (need a + # cudf::arrow_schema as well), see + # https://github.com/rapidsai/cudf/issues/16104. + """ + #include + #include + + ArrowSchema* to_arrow_schema_raw( + cudf::table_view const& input, + cudf::host_span metadata) { + return to_arrow_schema(input, metadata).release(); + } + + ArrowArray* to_arrow_host_raw( + cudf::table_view const& tbl, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { + // Assumes the sync event is null and the data is already on the host. 
+        ArrowArray *arr = new ArrowArray();
+        auto device_arr = cudf::to_arrow_host(tbl, stream, mr);
+        ArrowArrayMove(&device_arr->array, arr);
+        return arr;
+    }
+    """
+    cdef ArrowSchema *to_arrow_schema_raw(
+        const table_view& tbl,
+        const vector[column_metadata]& metadata,
+    ) except + nogil
+    cdef ArrowArray* to_arrow_host_raw(const table_view& tbl) except + nogil
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index e19ff58927f..acb2b5be85c 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -44,7 +44,7 @@ def metadata_from_arrow_type(
 def assert_column_eq(
     lhs: pa.Array | plc.Column,
     rhs: pa.Array | plc.Column,
-    check_field_nullability=True,
+    check_field_nullability=False,
 ) -> None:
     """Verify that a pylibcudf array and PyArrow array are equal.

@@ -59,7 +59,9 @@ def assert_column_eq(
     on child fields are equal.

     Useful for checking roundtripping of lossy formats like JSON that may not
-    preserve this information.
+    preserve this information. Also, our Arrow interop functions make different
+    choices by default than pyarrow field constructors since the interop functions
+    may make data-dependent choices.
     """
     # Nested types require children metadata to be passed to the conversion function.
     if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance(

From 1c63e1ee31a07fb4999d7356919280ba3d528741 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 15 Aug 2024 21:51:47 -0400
Subject: [PATCH 071/270] Initial investigation into NumPy proxying in `cudf.pandas` (#16286)

Part of #15397. Closes #14537. Creates `ProxyNDarrayBase`, which inherits from `np.ndarray`.

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16286
---
 python/cudf/cudf/pandas/_wrappers/numpy.py |  3 +++
 python/cudf/cudf/pandas/fast_slow_proxy.py | 20 +++++++++++++++-
 python/cudf/cudf/pandas/proxy_base.py      | 23 +++++++++++++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py   |  8 +++++++
 4 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/pandas/proxy_base.py

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 3b012169676..eabea9713f1 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -14,6 +14,7 @@
     make_final_proxy_type,
     make_intermediate_proxy_type,
 )
+from ..proxy_base import ProxyNDarrayBase
 from .common import (
     array_interface,
     array_method,
@@ -111,12 +112,14 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     numpy.ndarray,
     fast_to_slow=cupy.ndarray.get,
     slow_to_fast=cupy.asarray,
+    bases=(ProxyNDarrayBase,),
     additional_attributes={
         "__array__": array_method,
         # So that pa.array(wrapped-numpy-array) works
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
         "__array_interface__": array_interface,
+        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
         # ndarrays are unhashable
         "__hash__": None,
         # iter(cupy-array) produces an iterable of zero-dim device
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index bb678fd1efe..61aa6310082 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -19,6 +19,7 @@ from ..options import _env_get_bool from ..testing import assert_eq from .annotation import nvtx +from .proxy_base import ProxyNDarrayBase def call_operator(fn, args, kwargs): @@ -564,7 +565,11 @@ def _fsproxy_wrap(cls, value, func): _FinalProxy subclasses can override this classmethod if they need particular behaviour when wrapped up. """ - proxy = object.__new__(cls) + base_class = _get_proxy_base_class(cls) + if base_class is object: + proxy = base_class.__new__(cls) + else: + proxy = base_class.__new__(cls, value) proxy._fsproxy_wrapped = value return proxy @@ -1193,6 +1198,19 @@ def is_proxy_object(obj: Any) -> bool: return False +def _get_proxy_base_class(cls): + """Returns the proxy base class if one exists""" + for proxy_class in PROXY_BASE_CLASSES: + if proxy_class in cls.__mro__: + return proxy_class + return object + + +PROXY_BASE_CLASSES: set[type] = { + ProxyNDarrayBase, +} + + NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py new file mode 100644 index 00000000000..61d9cde127c --- /dev/null +++ b/python/cudf/cudf/pandas/proxy_base.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import cupy as cp +import numpy as np + + +class ProxyNDarrayBase(np.ndarray): + def __new__(cls, arr): + if isinstance(arr, cp.ndarray): + obj = np.asarray(arr.get()).view(cls) + return obj + elif isinstance(arr, np.ndarray): + obj = np.asarray(arr).view(cls) + return obj + else: + raise TypeError( + "Unsupported array type. Must be numpy.ndarray or cupy.ndarray" + ) + + def __array_finalize__(self, obj): + self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", None) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 6292022d8e4..e5483fff913 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1632,3 +1632,11 @@ def test_change_index_name(index): assert s.index.name == name assert df.index.name == name + + +def test_numpy_ndarray_isinstancecheck(series): + s1, s2 = series + arr1 = s1.values + arr2 = s2.values + assert isinstance(arr1, np.ndarray) + assert isinstance(arr2, np.ndarray) From e690d9d25b4fadbd553f7ef14ac4918e95d98b0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 16:48:49 -1000 Subject: [PATCH 072/270] Ensure size is always passed to NumericalColumn (#16576) https://github.com/rapidsai/cudf/pull/16457 requires `NumericalColumn` to be constructed with `size`. It appears another PR got in after this PR was created so there are currently a few usages where `size` isn't passed in. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16576 --- python/cudf/cudf/core/_internals/where.py | 4 +-- python/cudf/cudf/core/column/categorical.py | 37 +++++---------------- python/cudf/cudf/core/column/column.py | 1 + python/cudf/cudf/core/column/numerical.py | 1 + python/cudf/cudf/core/dataframe.py | 5 +-- python/cudf/cudf/core/index.py | 1 + 6 files changed, 13 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 9f36499586b..0c754317185 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -110,9 +110,7 @@ def _make_categorical_like(result, column): if isinstance(column, cudf.core.column.CategoricalColumn): result = cudf.core.column.build_categorical_column( categories=column.categories, - codes=cudf.core.column.NumericalColumn( - result.base_data, dtype=result.dtype - ), + codes=result, mask=result.base_mask, size=result.size, offset=result.offset, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d25983842f9..66aed38bffd 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -659,10 +659,7 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: Self, cudf.core.column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.NumericalColumn( - codes.base_data, # type: ignore[arg-type] - dtype=codes.dtype, - ), + codes=codes, mask=codes.base_mask, ordered=self.ordered, size=codes.size, @@ -734,10 +731,7 @@ def sort_values( codes = self.codes.sort_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories._values, - codes=cudf.core.column.NumericalColumn( - codes.base_data, # type: ignore[arg-type] - dtype=codes.dtype, - ), + codes=codes, mask=codes.base_mask, size=codes.size, ordered=self.dtype.ordered, @@ -845,10 +839,7 @@ def unique(self) -> CategoricalColumn: codes = self.codes.unique() return column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.NumericalColumn( - codes.base_data, # type: ignore[arg-type] - dtype=codes.dtype, - ), + codes=codes, mask=codes.base_mask, offset=codes.offset, size=codes.size, @@ -986,9 +977,7 @@ def find_and_replace( result = column.build_categorical_column( categories=new_cats["cats"], - codes=cudf.core.column.NumericalColumn( - output.base_data, dtype=output.dtype - ), + codes=output, mask=output.base_mask, offset=output.offset, size=output.size, @@ -1184,10 +1173,7 @@ def _concat( return column.build_categorical_column( categories=column.as_column(cats), - codes=cudf.core.column.NumericalColumn( - codes_col.base_data, # type: ignore[arg-type] - dtype=codes_col.dtype, - ), + codes=codes_col, mask=codes_col.base_mask, size=codes_col.size, offset=codes_col.offset, @@ -1199,10 +1185,7 @@ def _with_type_metadata( if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=cudf.core.column.NumericalColumn( - self.codes.base_data, # type: ignore[arg-type] - dtype=self.codes.dtype, - ), + codes=self.codes, mask=self.codes.base_mask, ordered=dtype.ordered, size=self.codes.size, @@ -1345,9 +1328,7 @@ def _set_categories( Self, column.build_categorical_column( 
categories=new_cats, - codes=cudf.core.column.NumericalColumn( - new_codes.base_data, dtype=new_codes.dtype - ), + codes=new_codes, mask=new_codes.base_mask, size=new_codes.size, offset=new_codes.offset, @@ -1478,9 +1459,7 @@ def pandas_categorical_as_column( return column.build_categorical_column( categories=categorical.categories, - codes=cudf.core.column.NumericalColumn( - codes.base_data, dtype=codes.dtype - ), + codes=codes, size=codes.size, mask=mask, ordered=categorical.ordered, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b0e33e8b9ce..090c02da990 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1513,6 +1513,7 @@ def column_empty( * cudf.dtype(libcudf.types.size_type_dtype).itemsize ) ), + size=None, dtype=libcudf.types.size_type_dtype, ), ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 16e78ef35ef..ac36813202a 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -654,6 +654,7 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: categories=dtype.categories._values, codes=cudf.core.column.NumericalColumn( self.base_data, # type: ignore[arg-type] + self.size, dtype=self.dtype, ), mask=self.base_mask, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3033abd53f5..f935217f4f9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -46,7 +46,6 @@ from cudf.core.column import ( CategoricalColumn, ColumnBase, - NumericalColumn, StructColumn, as_column, build_categorical_column, @@ -8541,9 +8540,7 @@ def _reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=NumericalColumn( - cols[name].base_data, dtype=cols[name].dtype - ), + codes=cols[name], mask=cols[name].base_mask, offset=cols[name].offset, size=cols[name].size, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d02633a97fa..ee2f0317f8d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2501,6 +2501,7 @@ def _get_dt_field(self, field: str) -> Index: out_column = self._column.get_dt_field(field) out_column = NumericalColumn( data=out_column.base_data, + size=out_column.size, dtype=out_column.dtype, mask=out_column.base_mask, offset=out_column.offset, From e197d72f2daafb2f4804f823019b1ca7810ed560 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 16 Aug 2024 09:45:30 -0700 Subject: [PATCH 073/270] Replace `NativeFile` dependency in dask-cudf Parquet reader (#16569) Replaces `read_parquet` logic that currently depends on `NativeFile` for remote-storage access. **NOTE**: ~It is possible to remove `NativeFile` usage without adding the new `_prefetch_remote_buffers` logic.~ ~However, I'd like to replace the cudf data-transfer logic soon anyway.~ Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/cudf/pull/16569 --- python/dask_cudf/dask_cudf/backends.py | 21 ++++ python/dask_cudf/dask_cudf/io/parquet.py | 102 +++++++----------- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 64 +++++++---- 3 files changed, 101 insertions(+), 86 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 82ea2ac033a..a65ae819b44 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -498,6 +498,25 @@ def _unsupported_kwargs(old, new, kwargs): ) +def _raise_unsupported_parquet_kwargs( + open_file_options=None, filesystem=None, **kwargs +): + import fsspec + + if open_file_options is not None: + raise ValueError( + "The open_file_options argument is no longer supported " + "by the 'cudf' backend." + ) + + if filesystem not in ("fsspec", None) and not isinstance( + filesystem, fsspec.AbstractFileSystem + ): + raise ValueError( + f"filesystem={filesystem} is not supported by the 'cudf' backend." + ) + + # Register cudf->pandas to_pandas_dispatch = PandasBackendEntrypoint.to_backend_dispatch() @@ -573,6 +592,7 @@ def from_dict( def read_parquet(*args, engine=None, **kwargs): from dask_cudf.io.parquet import CudfEngine + _raise_unsupported_parquet_kwargs(**kwargs) return _default_backend( dd.read_parquet, *args, @@ -665,6 +685,7 @@ def read_parquet(*args, engine=None, **kwargs): from dask_cudf.io.parquet import CudfEngine + _raise_unsupported_parquet_kwargs(**kwargs) return _default_backend( dx.read_parquet, *args, engine=CudfEngine, **kwargs ) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index f0cab953458..8f52fce7818 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,7 +1,6 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. import itertools import warnings -from contextlib import ExitStack from functools import partial from io import BufferedWriter, BytesIO, IOBase @@ -22,18 +21,13 @@ import cudf from cudf.core.column import as_column, build_categorical_column from cudf.io import write_to_dataset -from cudf.io.parquet import ( - _apply_post_filters, - _default_open_file_options, - _normalize_filters, -) +from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type from cudf.utils.ioutils import ( _ROW_GROUP_SIZE_BYTES_DEFAULT, + _fsspec_data_transfer, _is_local_filesystem, - _open_remote_files, ) -from cudf.utils.utils import maybe_filter_deprecation class CudfEngine(ArrowDatasetEngine): @@ -98,63 +92,45 @@ def _read_paths( dataset_kwargs = dataset_kwargs or {} dataset_kwargs["partitioning"] = partitioning or "hive" - with ExitStack() as stack: - # Non-local filesystem handling - paths_or_fobs = paths - if not _is_local_filesystem(fs): - paths_or_fobs = _open_remote_files( - paths_or_fobs, - fs, - context_stack=stack, - **_default_open_file_options( - open_file_options, columns, row_groups - ), - ) - # Filter out deprecation warning unless the user - # specifies open_file_options and/or use_python_file_object. - # Otherwise, the FutureWarning is out of their control. 
- with maybe_filter_deprecation( - ( - not open_file_options - and "use_python_file_object" not in kwargs - ), - message="Support for reading pyarrow's NativeFile is deprecated", - category=FutureWarning, - ): - # Use cudf to read in data - try: - df = cudf.read_parquet( - paths_or_fobs, - engine="cudf", - columns=columns, - row_groups=row_groups if row_groups else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - **kwargs, - ) - except RuntimeError as err: - # TODO: Remove try/except after null-schema issue is resolved - # (See: https://github.com/rapidsai/cudf/issues/12702) - if len(paths_or_fobs) > 1: - df = cudf.concat( - [ - cudf.read_parquet( - pof, - engine="cudf", - columns=columns, - row_groups=row_groups[i] - if row_groups - else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - **kwargs, - ) - for i, pof in enumerate(paths_or_fobs) - ] + # Non-local filesystem handling + paths_or_fobs = paths + if not _is_local_filesystem(fs): + paths_or_fobs = [ + _fsspec_data_transfer(fpath, fs=fs) for fpath in paths + ] + + # Use cudf to read in data + try: + df = cudf.read_parquet( + paths_or_fobs, + engine="cudf", + columns=columns, + row_groups=row_groups if row_groups else None, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, + **kwargs, + ) + except RuntimeError as err: + # TODO: Remove try/except after null-schema issue is resolved + # (See: https://github.com/rapidsai/cudf/issues/12702) + if len(paths_or_fobs) > 1: + df = cudf.concat( + [ + cudf.read_parquet( + pof, + engine="cudf", + columns=columns, + row_groups=row_groups[i] if row_groups else None, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, + **kwargs, ) - else: - raise err + for i, pof in enumerate(paths_or_fobs) + ] + ) + else: + raise err # Apply filters (if any are defined) df = _apply_post_filters(df, filters) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index ac3245b3748..99f19917424 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -5,8 +5,8 @@ from contextlib import contextmanager from io import BytesIO +import fsspec import pandas as pd -import pyarrow.fs as pa_fs import pytest from dask.dataframe import assert_eq @@ -135,35 +135,53 @@ def test_read_csv_warns(s3_base, s3so): assert df.a.sum().compute() == 4 -@pytest.mark.parametrize( - "open_file_options", - [ - {"precache_options": {"method": None}}, - {"precache_options": {"method": "parquet"}}, - {"open_file_func": None}, - ], -) -def test_read_parquet_open_file_options(s3_base, s3so, open_file_options, pdf): +def test_read_parquet_open_file_options_raises(): + with pytest.raises(ValueError): + dask_cudf.read_parquet( + "s3://my/path", + open_file_options={"precache_options": {"method": "parquet"}}, + ) + + +def test_read_parquet_filesystem(s3_base, s3so, pdf): + fname = "test_parquet_filesystem.parquet" + bucket = "parquet" buffer = BytesIO() pdf.to_parquet(path=buffer) buffer.seek(0) - with s3_context( - s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer} - ): - if "open_file_func" in open_file_options: - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + path = f"s3://{bucket}/{fname}" + + # Cannot pass filesystem="arrow" + with pytest.raises(ValueError): + dask_cudf.read_parquet( + path, + storage_options=s3so, + filesystem="arrow", 
) - open_file_options["open_file_func"] = fs.open_input_file + + # Can pass filesystem="fsspec" df = dask_cudf.read_parquet( - "s3://daskparquet/*.parq", + path, storage_options=s3so, - open_file_options=open_file_options, + filesystem="fsspec", ) - with pytest.warns(FutureWarning): - assert df.a.sum().compute() == 10 - with pytest.warns(FutureWarning): - assert df.b.sum().compute() == 9 + assert df.b.sum().compute() == 9 + + +def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf): + fname = "test_parquet_filesystem_explicit.parquet" + bucket = "parquet" + buffer = BytesIO() + pdf.to_parquet(path=buffer) + buffer.seek(0) + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + path = f"s3://{bucket}/{fname}" + fs = fsspec.core.get_fs_token_paths( + path, mode="rb", storage_options=s3so + )[0] + df = dask_cudf.read_parquet(path, filesystem=fs) + assert df.b.sum().compute() == 9 def test_read_parquet(s3_base, s3so, pdf): From 623dfceb42eb3e73b352b295898ff3e6cfe7c865 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 16 Aug 2024 12:50:23 -0400 Subject: [PATCH 074/270] [FEA] Add support for `cudf.unique` (#16554) closes #16460 Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16554 --- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/core/algorithms.py | 122 ++++++++++++++++++++++++++ python/cudf/cudf/tests/test_unique.py | 117 ++++++++++++++++++++++++ 3 files changed, 240 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/tests/test_unique.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 77ae0791b81..ccc45413de4 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -24,7 +24,7 @@ register_series_accessor, ) from cudf.api.types import dtype -from cudf.core.algorithms import factorize +from cudf.core.algorithms import factorize, unique from cudf.core.cut import cut from cudf.core.dataframe import DataFrame, from_dataframe, from_pandas, merge from cudf.core.dtypes import ( diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index e27d6ec8d3e..b28fce6d343 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -7,6 +7,7 @@ import cupy as cp import numpy as np +import cudf from cudf.core.column import as_column from cudf.core.index import Index, RangeIndex from cudf.core.scalar import Scalar @@ -145,3 +146,124 @@ def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase: first_nan_idx = valid_locs.values.argmax().item() result[:first_nan_idx] = np.nan return as_column(result) + + +def unique(values): + """ + Return unique values from array-like + + Parameters + ---------- + values : 1d array-like + + Returns + ------- + cudf.Series, + + The return can be: + + * Index : when the input is an Index + * cudf.Series : when the input is a Series + * cupy.ndarray : when the input is a cupy.ndarray + + Return cudf.Series, cudf.Index, or cupy.ndarray. + + See Also + -------- + Index.unique : Return unique values from an Index. + Series.unique : Return unique values of Series object. 
+ + Examples + -------- + >>> cudf.unique(cudf.Series([2, 1, 3, 3])) + 0 2 + 1 1 + 2 3 + dtype: int64 + + >>> cudf.unique(cudf.Series([2] + [1] * 5)) + 0 2 + 1 1 + dtype: int64 + + >>> cudf.unique(cudf.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) + 0 2016-01-01 + dtype: datetime64[ns] + + >>> cudf.unique( + ... cudf.Series( + ... [ + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... pd.Timestamp("20160103", tz="US/Eastern"), + ... ] + ... ) + ... ) + 0 2016-01-01 00:00:00-05:00 + 1 2016-01-03 00:00:00-05:00 + dtype: datetime64[ns, US/Eastern] + + >>> cudf.unique( + ... cudf.Index( + ... [ + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... pd.Timestamp("20160103", tz="US/Eastern"), + ... ] + ... ) + ... ) + DatetimeIndex(['2016-01-01 00:00:00-05:00', '2016-01-03 00:00:00-05:00'],dtype='datetime64[ns, US/Eastern]') + + An unordered Categorical will return categories in the + order of appearance. + + >>> cudf.unique(cudf.Series(pd.Categorical(list("baabc")))) + 0 b + 1 a + 2 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + + >>> cudf.unique(cudf.Series(pd.Categorical(list("baabc"), categories=list("abc")))) + 0 b + 1 a + 2 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + + An ordered Categorical preserves the category ordering. + + >>> pd.unique( + ... pd.Series( + ... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True) + ... ) + ... ) + 0 b + 1 a + 2 c + dtype: category + Categories (3, object): ['a' < 'b' < 'c'] + + An array of tuples + + >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) + array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) + """ + if not isinstance(values, (cudf.Series, cudf.Index, cp.ndarray)): + raise ValueError( + "Must pass cudf.Series, cudf.Index, or cupy.ndarray object" + ) + if isinstance(values, cp.ndarray): + # pandas.unique will not sort the values in the result + # while cupy.unique documents it will, so we pass cupy.ndarray + # through cudf.Index to maintain the original order. + return cp.asarray(cudf.Index(values).unique()) + if isinstance(values, cudf.Series): + if get_option("mode.pandas_compatible"): + if isinstance(values.dtype, cudf.CategoricalDtype): + raise NotImplementedError( + "cudf.Categorical is not implemented" + ) + else: + return cp.asarray(values.unique()) + return values.unique() diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/test_unique.py new file mode 100644 index 00000000000..699b3340521 --- /dev/null +++ b/python/cudf/cudf/tests/test_unique.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture +def df(): + df = cudf.DataFrame() + np.random.seed(0) + + arr = np.random.randint(2, size=10, dtype=np.int64) + df["foo"] = arr + df["bar"] = cudf.Series([pd.Timestamp(x) for x in arr]) + + return df + + +@pytest.fixture(params=["foo", "bar"]) +def series_test_vals(request, df): + actual = cudf.unique(df[request.param]) + expected = pd.unique(df[request.param].to_pandas()) + return actual, expected + + +def test_unique_series_obj(series_test_vals): + actual, expected = series_test_vals + + assert isinstance(expected, np.ndarray) + assert isinstance(actual, cudf.Series) + assert_eq(actual, pd.Series(expected, name=actual.name)) + + +@pytest.mark.parametrize( + "index", + [ + (cudf.Index, pd.Index), + (cudf.MultiIndex, pd.MultiIndex), + (cudf.DatetimeIndex, pd.DatetimeIndex), + (cudf.CategoricalIndex, pd.CategoricalIndex), + ], +) +@pytest.mark.parametrize("col", ["foo", "bar"]) +def test_unique_index_obj(index, col, df): + if index[0] == cudf.MultiIndex: + df.index = cudf.MultiIndex.from_arrays([df[col], df[col]]) + else: + df.index = index[0](df[col]) + actual = cudf.unique(df.index) + expected = pd.unique(df.index.to_pandas()) + + isinstance(expected, np.ndarray) + assert isinstance(actual, index[0]) + + if index[0] == cudf.MultiIndex: + expect = index[1].from_arrays( + [ + [x[0] for x in expected], + [x[1] for x in expected], + ], + names=actual.names, + ) + assert_eq(actual, expect) + else: + assert_eq(actual, index[1](expected, name=actual.name)) + + +def test_unique_cupy_ndarray(df): + arr = np.asarray(df["foo"].to_pandas()) + garr = cp.asarray(df["foo"]) + + expected = pd.unique(arr) + actual = cudf.unique(garr) + + isinstance(expected, np.ndarray) + isinstance(actual, cp.ndarray) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "def", "abc", "a", "def", None], + [10, 20, 100, -10, 0, 1, None, 10, 100], + ], +) +def test_category_dtype_unique(data): + gs = cudf.Series(data, dtype="category") + ps = gs.to_pandas() + + actual = cudf.unique(gs) + expected = pd.unique(ps) + + assert isinstance(expected, pd.Categorical) + assert isinstance(actual, cudf.Series) + assert_eq(actual, pd.Series(expected)) + + +def test_unique_fails_value_error(df): + with pytest.raises( + ValueError, + match="Must pass cudf.Series, cudf.Index, or cupy.ndarray object", + ): + cudf.unique(df) + + +def test_unique_fails_not_implemented_error(df): + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises( + NotImplementedError, match="cudf.Categorical is not implemented" + ): + cudf.unique(cudf.Series(["foo", "foo"], dtype="category")) From e16c2f2493d316259dc2472b448e61b6e717b7dd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 16 Aug 2024 07:17:40 -1000 Subject: [PATCH 075/270] Make (Indexed)Frame.__init__ require data (and index) (#16430) This PR makes `data` and `Index` required arguments of `Frame` and `IndexedFrame` where relevant so we can gradually move towards ensuring `data` and `index` are valid mapping of columns and a cudf Index respectively Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16430 --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 8 ++------ python/cudf/cudf/core/indexed_frame.py | 16 
+++++++++------- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/series.py | 2 +- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f935217f4f9..3d805881c5a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -697,7 +697,7 @@ def __init__( ): if copy is not None: raise NotImplementedError("copy is not currently implemented.") - super().__init__() + super().__init__({}, index=cudf.Index([])) if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 32c313e42d3..ce23d671a6c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -53,14 +53,10 @@ class Frame(BinaryOperand, Scannable): A Frame representing the (optional) index columns. """ - _data: "ColumnAccessor" - _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS - def __init__(self, data=None): - if data is None: - data = {} - self._data = cudf.core.column_accessor.ColumnAccessor(data) + def __init__(self, data: ColumnAccessor | MutableMapping[Any, ColumnBase]): + self._data = ColumnAccessor(data) @property def _num_columns(self) -> int: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ae7369c80d1..8eb6de79bce 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -265,7 +265,6 @@ class IndexedFrame(Frame): # mypy can't handle bound type variables as class members _loc_indexer_type: type[_LocIndexerClass] # type: ignore _iloc_indexer_type: type[_IlocIndexerClass] # type: ignore - _index: cudf.core.index.BaseIndex _groupby = GroupBy _resampler = _Resampler @@ -284,18 +283,21 @@ class IndexedFrame(Frame): "cummax": {"op_name": "cumulative max"}, } - def __init__(self, data=None, index=None): + def __init__( + self, + data: ColumnAccessor | MutableMapping[Any, ColumnBase], + index: BaseIndex, + ): super().__init__(data=data) - # TODO: Right now it is possible to initialize an IndexedFrame without - # an index. The code's correctness relies on the subclass constructors - # assigning the attribute after the fact. We should restructure those - # to ensure that this constructor is always invoked with an index. + if not isinstance(index, cudf.core._base_index.BaseIndex): + raise ValueError( + f"index must be a cudf index not {type(index).__name__}" + ) self._index = index @property def _num_rows(self) -> int: # Important to use the index because the data may be empty. 
- # TODO: Remove once DataFrame.__init__ is cleaned up return len(self.index) @property diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index df471692702..703a239bea2 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -490,7 +490,7 @@ def concat( elif len(objs) == 1: obj = objs[0] result = cudf.DataFrame._from_data( - data=None if join == "inner" else obj._data.copy(deep=True), + data={} if join == "inner" else obj._data.copy(deep=True), index=cudf.RangeIndex(len(obj)) if ignore_index else obj.index.copy(deep=True), diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2fb4fde6552..4be10752651 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -518,7 +518,7 @@ def from_categorical(cls, categorical, codes=None): @classmethod @_performance_tracking - def from_arrow(cls, array: pa.Array): + def from_arrow(cls, array: pa.Array) -> Self: """Create from PyArrow Array/ChunkedArray. Parameters From 30011c58ed2444f0a6ba9f80c17766e591a610a1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 16 Aug 2024 07:19:54 -1000 Subject: [PATCH 076/270] Clean up reshaping ops (#16553) Uses some more "idiomatic" cudf patterns such as * Checking `isinstance(column.dtype, ...)` instead of `isinstance(column, ...)` (to avoid importing the column objects) * Using `DataFrame._from_data(dict)` instead of creating an empty `DataFrame` and adding columns one by one Also avoids some column materialization in `DataFrame.columns = `: * For `RangeIndex`, avoid materializing to a column to get a distinct count * For `MultiIndex`, avoid creating a `cudf.MultiIndex` with columns as it's converted to a CPU object to get column labels for the `ColumnAccessor` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16553 --- python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/core/reshape.py | 141 ++++++++++++++++------------- 2 files changed, 82 insertions(+), 67 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3d805881c5a..6ee3d69441f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2654,8 +2654,12 @@ def columns(self, columns): elif isinstance(columns, (cudf.BaseIndex, ColumnBase, Series)): level_names = (getattr(columns, "name", None),) rangeindex = isinstance(columns, cudf.RangeIndex) - columns = as_column(columns) - if columns.distinct_count(dropna=False) != len(columns): + if rangeindex: + unique_count = len(columns) + else: + columns = as_column(columns) + unique_count = columns.distinct_count(dropna=False) + if unique_count != len(columns): raise ValueError("Duplicate column names are not allowed") pd_columns = pd.Index(columns.to_pandas()) label_dtype = pd_columns.dtype diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 703a239bea2..3d205957126 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -3,7 +3,7 @@ import itertools import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -14,7 +14,7 @@ from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, 
column_empty_like -from cudf.core.column.categorical import CategoricalColumn +from cudf.core.column_accessor import ColumnAccessor from cudf.utils.dtypes import min_unsigned_type if TYPE_CHECKING: @@ -101,7 +101,9 @@ def _get_combined_index(indexes, intersect: bool = False, sort=None): return index -def _normalize_series_and_dataframe(objs, axis): +def _normalize_series_and_dataframe( + objs: list[cudf.Series | cudf.DataFrame], axis: Literal[0, 1] +) -> None: """Convert any cudf.Series objects in objs to DataFrames in place.""" # Default to naming series by a numerical id if they are not named. sr_name = 0 @@ -335,7 +337,7 @@ def concat( result = obj.to_frame() else: result = obj.copy(deep=True) - result.columns = pd.RangeIndex(len(result._data)) + result.columns = cudf.RangeIndex(len(result._data)) else: result = type(obj)._from_data( data=obj._data.copy(deep=True), @@ -350,7 +352,7 @@ def concat( result = obj.copy(deep=True) if keys_objs is not None and isinstance(result, cudf.DataFrame): k = keys_objs[0] - result.columns = cudf.MultiIndex.from_tuples( + result.columns = pd.MultiIndex.from_tuples( [ (k, *c) if isinstance(c, tuple) else (k, c) for c in result._column_names @@ -369,7 +371,6 @@ def concat( raise TypeError( "Can only concatenate Series and DataFrame objects when axis=1" ) - df = cudf.DataFrame() _normalize_series_and_dataframe(objs, axis=axis) any_empty = any(obj.empty for obj in objs) @@ -393,18 +394,23 @@ def concat( objs = [obj for obj in objs if obj.shape != (0, 0)] if len(objs) == 0: - return df + # TODO: https://github.com/rapidsai/cudf/issues/16550 + return cudf.DataFrame() # Don't need to align indices of all `objs` since we # would anyway return an empty dataframe below if not empty_inner: objs = _align_objs(objs, how=join, sort=sort) - df.index = objs[0].index + result_index = objs[0].index + else: + result_index = None + result_data = {} + result_columns = None if keys_objs is None: for o in objs: for name, col in o._data.items(): - if name in df._data: + if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " f"doesn't support having multiple columns with " @@ -414,11 +420,11 @@ def concat( # if join is inner and it contains an empty df # we return an empty df, hence creating an empty # column with dtype metadata retained. 
- df[name] = cudf.core.column.column_empty_like( + result_data[name] = cudf.core.column.column_empty_like( col, newsize=0 ) else: - df[name] = col + result_data[name] = col result_columns = ( objs[0] @@ -451,21 +457,21 @@ def concat( else: col_label = (k, name) if empty_inner: - df[col_label] = cudf.core.column.column_empty_like( - col, newsize=0 + result_data[col_label] = ( + cudf.core.column.column_empty_like(col, newsize=0) ) else: - df[col_label] = col + result_data[col_label] = col - if keys_objs is None: - df.columns = result_columns.unique() - if ignore_index: - df.columns = cudf.RangeIndex(len(result_columns.unique())) - elif ignore_index: - # with ignore_index the column names change to numbers - df.columns = cudf.RangeIndex(len(result_columns)) + df = cudf.DataFrame._from_data( + ColumnAccessor(result_data, verify=False), index=result_index + ) + if ignore_index: + df.columns = cudf.RangeIndex(df._num_columns) + elif result_columns is not None: + df.columns = result_columns elif not only_series: - df.columns = cudf.MultiIndex.from_tuples(df._column_names) + df.columns = pd.MultiIndex.from_tuples(df._column_names) if empty_inner: # if join is inner and it contains an empty df @@ -486,6 +492,7 @@ def concat( if len(objs) == 0: # If objs is empty, that indicates all of # objs are empty dataframes. + # TODO: https://github.com/rapidsai/cudf/issues/16550 return cudf.DataFrame() elif len(objs) == 1: obj = objs[0] @@ -519,7 +526,7 @@ def concat( elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): - return cudf.core.index.Index._concat(objs) + return cudf.Index._concat(objs) else: raise TypeError(f"cannot concatenate object of type {typ}") @@ -632,18 +639,19 @@ def melt( value_vars = [c for c in frame._column_names if c not in unique_id] # Error for unimplemented support for datatype - dtypes = [frame[col].dtype for col in id_vars + value_vars] - if any(isinstance(typ, cudf.CategoricalDtype) for typ in dtypes): + if any( + isinstance(frame[col].dtype, cudf.CategoricalDtype) + for col in id_vars + value_vars + ): raise NotImplementedError( "Categorical columns are not yet supported for function" ) # Check dtype homogeneity in value_var # Because heterogeneous concat is unimplemented - dtypes = [frame[col].dtype for col in value_vars] - if len(dtypes) > 0: - dtype = dtypes[0] - if any(t != dtype for t in dtypes): + if len(value_vars) > 1: + dtype = frame[value_vars[0]].dtype + if any(frame[col].dtype != dtype for col in value_vars): raise ValueError("all cols in value_vars must have the same dtype") # overlap @@ -969,37 +977,39 @@ def _pivot(df, index, columns): index_labels, index_idx = index._encode() column_labels = columns_labels.to_pandas().to_flat_index() - def as_tuple(x): - return x if isinstance(x, tuple) else (x,) - result = {} - for v in df: - names = [as_tuple(v) + as_tuple(name) for name in column_labels] + if len(index_labels) != 0 and len(columns_labels) != 0: + + def as_tuple(x): + return x if isinstance(x, tuple) else (x,) + nrows = len(index_labels) - ncols = len(names) - num_elements = nrows * ncols - if num_elements > 0: - col = df._data[v] + for col_label, col in df._data.items(): + names = [ + as_tuple(col_label) + as_tuple(name) for name in column_labels + ] + new_size = nrows * len(names) scatter_map = (columns_idx * np.int32(nrows)) + index_idx - target = cudf.DataFrame._from_data( - { - None: cudf.core.column.column_empty_like( - col, masked=True, newsize=nrows * ncols - ) - } + target_col = 
cudf.core.column.column_empty_like( + col, masked=True, newsize=new_size ) - target._data[None][scatter_map] = col - result_frames = target._split(range(nrows, nrows * ncols, nrows)) + target_col[scatter_map] = col + target = cudf.Index._from_column(target_col) result.update( { - name: next(iter(f._columns)) - for name, f in zip(names, result_frames) + name: idx._column + for name, idx in zip( + names, target._split(range(nrows, new_size, nrows)) + ) } ) # the result of pivot always has a multicolumn - ca = cudf.core.column_accessor.ColumnAccessor( - result, multiindex=True, level_names=(None,) + columns._data.names + ca = ColumnAccessor( + result, + multiindex=True, + level_names=(None,) + columns._data.names, + verify=False, ) return cudf.DataFrame._from_data( ca, index=cudf.Index(index_labels, name=index.name) @@ -1070,19 +1080,20 @@ def pivot(data, columns=None, index=no_default, values=no_default): if index is no_default: index = df.index else: - index = cudf.core.index.Index(df.loc[:, index]) + index = cudf.Index(df.loc[:, index]) columns = cudf.Index(df.loc[:, columns]) # Create a DataFrame composed of columns from both # columns and index - columns_index = {} - columns_index = { - i: col - for i, col in enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - } - columns_index = cudf.DataFrame(columns_index) + ca = ColumnAccessor( + dict( + enumerate( + itertools.chain(index._data.columns, columns._data.columns) + ) + ), + verify=False, + ) + columns_index = cudf.DataFrame._from_data(ca) # Check that each row is unique: if len(columns_index) != len(columns_index.drop_duplicates()): @@ -1225,13 +1236,13 @@ def unstack(df, level, fill_value=None, sort: bool = True): return result -def _get_unique(column, dummy_na): +def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase: """ Returns unique values in a column, if dummy_na is False, nan's are also dropped. """ - if isinstance(column, cudf.core.column.CategoricalColumn): - unique = column.categories + if isinstance(column.dtype, cudf.CategoricalDtype): + unique = column.categories # type: ignore[attr-defined] else: unique = column.unique().sort_values() if not dummy_na: @@ -1251,11 +1262,11 @@ def _one_hot_encode_column( `prefix`, separated with category name with `prefix_sep`. The encoding columns maybe coerced into `dtype`. 
""" - if isinstance(column, CategoricalColumn): + if isinstance(column.dtype, cudf.CategoricalDtype): if column.size == column.null_count: column = column_empty_like(categories, newsize=column.size) else: - column = column._get_decategorized_column() + column = column._get_decategorized_column() # type: ignore[attr-defined] if column.size * categories.size >= np.iinfo(size_type_dtype).max: raise ValueError( @@ -1536,7 +1547,7 @@ def pivot_table( table_columns = tuple( map(lambda column: column[1:], table._data.names) ) - table.columns = cudf.MultiIndex.from_tuples( + table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names ) From bc8ca9befdd77d3f4a270a64dae178b2ef355181 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 16 Aug 2024 12:02:21 -0700 Subject: [PATCH 077/270] Setup pylibcudf package (#16299) Migrates cudf._lib.pylibcudf to a new pylibcudf package Authors: - Thomas Li (https://github.com/lithomas1) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16299 --- .github/labeler.yml | 2 +- .github/workflows/pr.yaml | 12 +- build.sh | 15 +- ci/build_docs.sh | 2 +- ci/build_python.sh | 7 + ci/build_wheel_cudf.sh | 8 +- ci/build_wheel_pylibcudf.sh | 16 ++ ci/cudf_pandas_scripts/pandas-tests/run.sh | 2 + ci/cudf_pandas_scripts/run_tests.sh | 2 + ci/test_python_cudf.sh | 2 +- ci/test_wheel_cudf.sh | 10 +- ci/test_wheel_cudf_polars.sh | 6 +- ci/test_wheel_dask_cudf.sh | 5 +- .../all_cuda-118_arch-x86_64.yaml | 2 + .../all_cuda-125_arch-x86_64.yaml | 2 + conda/recipes/cudf/meta.yaml | 2 + conda/recipes/cudf_kafka/meta.yaml | 4 +- conda/recipes/pylibcudf/build.sh | 4 + .../recipes/pylibcudf/conda_build_config.yaml | 20 +++ conda/recipes/pylibcudf/meta.yaml | 108 ++++++++++++ dependencies.yaml | 165 +++++++++++++++++- .../api_docs/pylibcudf/aggregation.rst | 2 +- .../api_docs/pylibcudf/binaryop.rst | 2 +- .../user_guide/api_docs/pylibcudf/column.rst | 2 +- .../api_docs/pylibcudf/column_factories.rst | 2 +- .../api_docs/pylibcudf/concatenate.rst | 2 +- .../user_guide/api_docs/pylibcudf/copying.rst | 2 +- .../api_docs/pylibcudf/datetime.rst | 2 +- .../api_docs/pylibcudf/expressions.rst | 2 +- .../user_guide/api_docs/pylibcudf/filling.rst | 2 +- .../api_docs/pylibcudf/gpumemoryview.rst | 2 +- .../user_guide/api_docs/pylibcudf/groupby.rst | 2 +- .../user_guide/api_docs/pylibcudf/interop.rst | 2 +- .../user_guide/api_docs/pylibcudf/io/avro.rst | 2 +- .../user_guide/api_docs/pylibcudf/io/csv.rst | 2 +- .../api_docs/pylibcudf/io/index.rst | 2 +- .../user_guide/api_docs/pylibcudf/io/json.rst | 2 +- .../api_docs/pylibcudf/io/parquet.rst | 2 +- .../user_guide/api_docs/pylibcudf/join.rst | 2 +- .../user_guide/api_docs/pylibcudf/lists.rst | 2 +- .../user_guide/api_docs/pylibcudf/merge.rst | 2 +- .../api_docs/pylibcudf/quantiles.rst | 2 +- .../user_guide/api_docs/pylibcudf/reduce.rst | 2 +- .../user_guide/api_docs/pylibcudf/replace.rst | 2 +- .../user_guide/api_docs/pylibcudf/reshape.rst | 2 +- .../user_guide/api_docs/pylibcudf/rolling.rst | 2 +- .../user_guide/api_docs/pylibcudf/round.rst | 2 +- .../user_guide/api_docs/pylibcudf/scalar.rst | 2 +- .../user_guide/api_docs/pylibcudf/search.rst | 2 +- .../user_guide/api_docs/pylibcudf/sorting.rst | 2 +- .../api_docs/pylibcudf/stream_compaction.rst | 2 +- .../api_docs/pylibcudf/strings/capitalize.rst 
| 2 +- .../api_docs/pylibcudf/strings/char_types.rst | 2 +- .../api_docs/pylibcudf/strings/contains.rst | 2 +- .../api_docs/pylibcudf/strings/find.rst | 2 +- .../pylibcudf/strings/regex_flags.rst | 2 +- .../pylibcudf/strings/regex_program.rst | 2 +- .../api_docs/pylibcudf/strings/replace.rst | 2 +- .../api_docs/pylibcudf/strings/slice.rst | 2 +- .../user_guide/api_docs/pylibcudf/table.rst | 2 +- .../user_guide/api_docs/pylibcudf/traits.rst | 2 +- .../api_docs/pylibcudf/transform.rst | 2 +- .../user_guide/api_docs/pylibcudf/types.rst | 2 +- .../user_guide/api_docs/pylibcudf/unary.rst | 2 +- python/cudf/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/aggregation.pyx | 3 +- python/cudf/cudf/_lib/avro.pyx | 4 +- python/cudf/cudf/_lib/binaryop.pyx | 3 +- python/cudf/cudf/_lib/column.pxd | 9 +- python/cudf/cudf/_lib/column.pyx | 20 +-- python/cudf/cudf/_lib/concat.pyx | 3 +- python/cudf/cudf/_lib/copying.pxd | 2 +- python/cudf/cudf/_lib/copying.pyx | 20 +-- python/cudf/cudf/_lib/csv.pyx | 21 ++- python/cudf/cudf/_lib/datetime.pyx | 13 +- python/cudf/cudf/_lib/filling.pyx | 4 +- python/cudf/cudf/_lib/groupby.pyx | 7 +- python/cudf/cudf/_lib/hash.pyx | 15 +- python/cudf/cudf/_lib/interop.pyx | 9 +- python/cudf/cudf/_lib/io/utils.pxd | 7 +- python/cudf/cudf/_lib/io/utils.pyx | 11 +- python/cudf/cudf/_lib/join.pyx | 2 +- python/cudf/cudf/_lib/json.pyx | 15 +- python/cudf/cudf/_lib/labeling.pyx | 10 +- python/cudf/cudf/_lib/lists.pyx | 7 +- python/cudf/cudf/_lib/merge.pyx | 2 +- python/cudf/cudf/_lib/null_mask.pyx | 11 +- .../cudf/_lib/nvtext/byte_pair_encode.pyx | 11 +- .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 9 +- .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 13 +- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 11 +- python/cudf/cudf/_lib/nvtext/minhash.pyx | 11 +- .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 13 +- python/cudf/cudf/_lib/nvtext/normalize.pyx | 9 +- python/cudf/cudf/_lib/nvtext/replace.pyx | 13 +- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 11 +- .../cudf/_lib/nvtext/subword_tokenize.pyx | 7 +- python/cudf/cudf/_lib/nvtext/tokenize.pyx | 13 +- python/cudf/cudf/_lib/orc.pyx | 33 ++-- python/cudf/cudf/_lib/parquet.pyx | 39 +++-- python/cudf/cudf/_lib/partitioning.pyx | 13 +- python/cudf/cudf/_lib/pylibcudf/io/avro.pxd | 12 -- .../pylibcudf/libcudf/strings/extract.pxd | 15 -- .../_lib/pylibcudf/strings/char_types.pxd | 5 - .../cudf/_lib/pylibcudf/strings/contains.pxd | 7 - .../_lib/pylibcudf/strings/regex_flags.pxd | 2 - python/cudf/cudf/_lib/quantiles.pyx | 5 +- python/cudf/cudf/_lib/reduce.pyx | 3 +- python/cudf/cudf/_lib/replace.pyx | 3 +- python/cudf/cudf/_lib/reshape.pyx | 5 +- python/cudf/cudf/_lib/rolling.pyx | 3 +- python/cudf/cudf/_lib/round.pyx | 4 +- python/cudf/cudf/_lib/scalar.pxd | 3 +- python/cudf/cudf/_lib/scalar.pyx | 14 +- python/cudf/cudf/_lib/search.pyx | 2 +- python/cudf/cudf/_lib/sort.pyx | 13 +- python/cudf/cudf/_lib/stream_compaction.pyx | 2 +- python/cudf/cudf/_lib/string_casting.pyx | 21 +-- python/cudf/cudf/_lib/strings/attributes.pyx | 9 +- python/cudf/cudf/_lib/strings/capitalize.pyx | 2 +- python/cudf/cudf/_lib/strings/case.pyx | 2 +- python/cudf/cudf/_lib/strings/char_types.pyx | 11 +- python/cudf/cudf/_lib/strings/combine.pyx | 13 +- python/cudf/cudf/_lib/strings/contains.pyx | 19 +- .../strings/convert/convert_fixed_point.pyx | 11 +- .../_lib/strings/convert/convert_floats.pyx | 9 +- .../_lib/strings/convert/convert_integers.pyx | 9 +- 
.../_lib/strings/convert/convert_lists.pyx | 11 +- .../_lib/strings/convert/convert_urls.pyx | 9 +- python/cudf/cudf/_lib/strings/extract.pyx | 11 +- python/cudf/cudf/_lib/strings/find.pyx | 6 +- .../cudf/cudf/_lib/strings/find_multiple.pyx | 9 +- python/cudf/cudf/_lib/strings/findall.pyx | 11 +- python/cudf/cudf/_lib/strings/json.pyx | 11 +- python/cudf/cudf/_lib/strings/padding.pyx | 11 +- python/cudf/cudf/_lib/strings/repeat.pyx | 9 +- python/cudf/cudf/_lib/strings/replace.pyx | 5 +- python/cudf/cudf/_lib/strings/replace_re.pyx | 17 +- .../cudf/_lib/strings/split/partition.pyx | 11 +- python/cudf/cudf/_lib/strings/split/split.pyx | 19 +- python/cudf/cudf/_lib/strings/strip.pyx | 11 +- python/cudf/cudf/_lib/strings/substring.pyx | 2 +- python/cudf/cudf/_lib/strings/translate.pyx | 13 +- python/cudf/cudf/_lib/strings/wrap.pyx | 9 +- python/cudf/cudf/_lib/strings_udf.pyx | 12 +- python/cudf/cudf/_lib/text.pyx | 7 +- python/cudf/cudf/_lib/timezone.pyx | 5 +- python/cudf/cudf/_lib/transform.pyx | 24 +-- python/cudf/cudf/_lib/transpose.pyx | 7 +- python/cudf/cudf/_lib/types.pxd | 8 +- python/cudf/cudf/_lib/types.pyx | 12 +- python/cudf/cudf/_lib/unary.pyx | 3 +- python/cudf/cudf/_lib/utils.pxd | 4 +- python/cudf/cudf/_lib/utils.pyx | 9 +- .../cudf/cudf/core/_internals/expressions.py | 4 +- python/cudf/cudf/core/buffer/buffer.py | 3 +- python/cudf/cudf/core/column/numerical.py | 3 +- python/cudf/cudf/core/indexed_frame.py | 4 +- python/cudf/cudf/pandas/__init__.py | 3 +- python/cudf/pyproject.toml | 3 + .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt | 4 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 5 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 3 +- .../cudf_polars/containers/column.py | 2 +- .../cudf_polars/containers/dataframe.py | 3 +- python/cudf_polars/cudf_polars/dsl/expr.py | 3 +- python/cudf_polars/cudf_polars/dsl/ir.py | 3 +- .../cudf_polars/cudf_polars/dsl/translate.py | 3 +- .../cudf_polars/typing/__init__.py | 4 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 3 +- .../cudf_polars/cudf_polars/utils/sorting.py | 2 +- python/cudf_polars/pyproject.toml | 2 +- .../tests/containers/test_column.py | 3 +- .../tests/containers/test_dataframe.py | 3 +- python/cudf_polars/tests/dsl/test_expr.py | 3 +- .../tests/expressions/test_literal.py | 3 +- .../tests/expressions/test_sort.py | 3 +- .../cudf_polars/tests/utils/test_broadcast.py | 3 +- python/pylibcudf/CMakeLists.txt | 100 +++++++++++ python/pylibcudf/README.md | 1 + .../cmake/Modules/LinkPyarrowHeaders.cmake | 0 .../cmake/Modules/WheelHelpers.cmake | 0 .../pylibcudf/CMakeLists.txt | 2 +- python/pylibcudf/pylibcudf/VERSION | 1 + .../_lib => pylibcudf}/pylibcudf/__init__.pxd | 0 .../_lib => pylibcudf}/pylibcudf/__init__.py | 1 + python/pylibcudf/pylibcudf/_version.py | 24 +++ .../pylibcudf/aggregation.pxd | 5 +- .../pylibcudf/aggregation.pyx | 21 +-- .../_lib => pylibcudf}/pylibcudf/binaryop.pxd | 3 +- .../_lib => pylibcudf}/pylibcudf/binaryop.pyx | 15 +- .../_lib => pylibcudf}/pylibcudf/column.pxd | 11 +- .../_lib => pylibcudf}/pylibcudf/column.pyx | 11 +- .../pylibcudf/column_factories.pxd | 3 +- .../pylibcudf/column_factories.pyx | 7 +- .../pylibcudf/concatenate.pxd | 0 .../pylibcudf/concatenate.pyx | 11 +- .../_lib => pylibcudf}/pylibcudf/copying.pxd | 5 +- .../_lib => pylibcudf}/pylibcudf/copying.pyx | 21 ++- .../_lib => pylibcudf}/pylibcudf/datetime.pxd | 0 .../_lib => pylibcudf}/pylibcudf/datetime.pyx | 7 +- .../pylibcudf/exception_handler.pxd | 0 .../pylibcudf/experimental.pxd | 0 .../pylibcudf/experimental.pyx | 3 +- 
.../pylibcudf/expressions.pxd | 3 +- .../pylibcudf/expressions.pyx | 15 +- .../_lib => pylibcudf}/pylibcudf/filling.pxd | 2 +- .../_lib => pylibcudf}/pylibcudf/filling.pyx | 9 +- .../pylibcudf/gpumemoryview.pxd | 0 .../pylibcudf/gpumemoryview.pyx | 0 .../_lib => pylibcudf}/pylibcudf/groupby.pxd | 9 +- .../_lib => pylibcudf}/pylibcudf/groupby.pyx | 15 +- .../_lib => pylibcudf}/pylibcudf/interop.pyx | 6 +- .../pylibcudf/io/CMakeLists.txt | 0 .../pylibcudf/io/__init__.pxd | 0 .../pylibcudf/io/__init__.py | 0 python/pylibcudf/pylibcudf/io/avro.pxd | 12 ++ .../_lib => pylibcudf}/pylibcudf/io/avro.pyx | 7 +- .../_lib => pylibcudf}/pylibcudf/io/csv.pyx | 11 +- .../pylibcudf/io/datasource.pxd | 5 +- .../pylibcudf/io/datasource.pyx | 5 +- .../_lib => pylibcudf}/pylibcudf/io/json.pxd | 7 +- .../_lib => pylibcudf}/pylibcudf/io/json.pyx | 19 +- .../pylibcudf/io/parquet.pxd | 11 +- .../pylibcudf/io/parquet.pyx | 15 +- .../_lib => pylibcudf}/pylibcudf/io/types.pxd | 7 +- .../_lib => pylibcudf}/pylibcudf/io/types.pyx | 13 +- .../_lib => pylibcudf}/pylibcudf/join.pxd | 2 +- .../_lib => pylibcudf}/pylibcudf/join.pyx | 9 +- .../pylibcudf/libcudf/CMakeLists.txt | 0 .../pylibcudf/libcudf/__init__.pxd | 0 .../pylibcudf/libcudf/__init__.py | 0 .../pylibcudf/libcudf/aggregation.pxd | 3 +- .../pylibcudf/libcudf/aggregation.pyx | 0 .../pylibcudf/libcudf/binaryop.pxd | 11 +- .../pylibcudf/libcudf/binaryop.pyx | 0 .../pylibcudf/libcudf/column/__init__.pxd | 0 .../pylibcudf/libcudf/column/__init__.py | 0 .../pylibcudf/libcudf/column/column.pxd | 9 +- .../libcudf/column/column_factories.pxd | 11 +- .../pylibcudf/libcudf/column/column_view.pxd | 7 +- .../pylibcudf/libcudf/concatenate.pxd | 7 +- .../pylibcudf/libcudf/contiguous_split.pxd | 5 +- .../pylibcudf/libcudf/copying.pxd | 19 +- .../pylibcudf/libcudf/copying.pyx | 0 .../pylibcudf/libcudf/datetime.pxd | 7 +- .../pylibcudf/libcudf/experimental.pxd | 0 .../pylibcudf/libcudf/expressions.pxd | 9 +- .../pylibcudf/libcudf/expressions.pyx | 0 .../pylibcudf/libcudf/filling.pxd | 13 +- .../pylibcudf/libcudf/groupby.pxd | 19 +- .../pylibcudf/libcudf/hash.pxd | 7 +- .../pylibcudf/libcudf/interop.pxd | 13 +- .../pylibcudf/libcudf/io/CMakeLists.txt | 0 .../pylibcudf/libcudf/io/__init__.pxd | 0 .../pylibcudf/libcudf/io/__init__.py | 0 .../pylibcudf/libcudf/io/arrow_io_source.pxd | 3 +- .../pylibcudf/libcudf/io/avro.pxd | 5 +- .../pylibcudf/libcudf/io/csv.pxd | 7 +- .../pylibcudf/libcudf/io/data_sink.pxd | 0 .../pylibcudf/libcudf/io/datasource.pxd | 0 .../pylibcudf/libcudf/io/json.pxd | 7 +- .../pylibcudf/libcudf/io/json.pyx | 0 .../pylibcudf/libcudf/io/orc.pxd | 7 +- .../pylibcudf/libcudf/io/orc_metadata.pxd | 5 +- .../pylibcudf/libcudf/io/parquet.pxd | 76 ++++---- .../pylibcudf/libcudf/io/parquet_metadata.pxd | 5 +- .../pylibcudf/libcudf/io/text.pxd | 3 +- .../pylibcudf/libcudf/io/timezone.pxd | 3 +- .../pylibcudf/libcudf/io/types.pxd | 11 +- .../pylibcudf/libcudf/io/types.pyx | 0 .../pylibcudf/libcudf/join.pxd | 9 +- .../pylibcudf/libcudf/labeling.pxd | 5 +- .../pylibcudf/libcudf/lists/__init__.pxd | 0 .../pylibcudf/libcudf/lists/__init__.py | 0 .../pylibcudf/libcudf/lists/combine.pxd | 7 +- .../pylibcudf/libcudf/lists/contains.pxd | 13 +- .../libcudf/lists/count_elements.pxd | 7 +- .../pylibcudf/libcudf/lists/explode.pxd | 7 +- .../pylibcudf/libcudf/lists/extract.pxd | 9 +- .../pylibcudf/libcudf/lists/filling.pxd | 5 +- .../pylibcudf/libcudf/lists/gather.pxd | 7 +- .../libcudf/lists/lists_column_view.pxd | 4 +- .../pylibcudf/libcudf/lists/reverse.pxd | 7 +- 
.../libcudf/lists/set_operations.pxd | 9 +- .../pylibcudf/libcudf/lists/sorting.pxd | 9 +- .../libcudf/lists/stream_compaction.pxd | 9 +- .../pylibcudf/libcudf/merge.pxd | 7 +- .../pylibcudf/libcudf/null_mask.pxd | 11 +- .../pylibcudf/libcudf/nvtext/__init__.pxd | 0 .../pylibcudf/libcudf/nvtext/__init__.py | 0 .../libcudf/nvtext/byte_pair_encode.pxd | 7 +- .../libcudf/nvtext/edit_distance.pxd | 5 +- .../libcudf/nvtext/generate_ngrams.pxd | 9 +- .../pylibcudf/libcudf/nvtext/jaccard.pxd | 7 +- .../pylibcudf/libcudf/nvtext/minhash.pxd | 7 +- .../libcudf/nvtext/ngrams_tokenize.pxd | 9 +- .../pylibcudf/libcudf/nvtext/normalize.pxd | 5 +- .../pylibcudf/libcudf/nvtext/replace.pxd | 9 +- .../pylibcudf/libcudf/nvtext/stemmer.pxd | 7 +- .../libcudf/nvtext/subword_tokenize.pxd | 5 +- .../pylibcudf/libcudf/nvtext/tokenize.pxd | 9 +- .../pylibcudf/libcudf/partitioning.pxd | 11 +- .../pylibcudf/libcudf/quantiles.pxd | 11 +- .../pylibcudf/libcudf/reduce.pxd | 14 +- .../pylibcudf/libcudf/reduce.pyx | 0 .../pylibcudf/libcudf/replace.pxd | 9 +- .../pylibcudf/libcudf/replace.pyx | 0 .../pylibcudf/libcudf/reshape.pxd | 9 +- .../pylibcudf/libcudf/rolling.pxd | 11 +- .../pylibcudf/libcudf/round.pxd | 5 +- .../pylibcudf/libcudf/round.pyx | 0 .../pylibcudf/libcudf/scalar/__init__.pxd | 0 .../pylibcudf/libcudf/scalar/__init__.py | 0 .../pylibcudf/libcudf/scalar/scalar.pxd | 9 +- .../libcudf/scalar/scalar_factories.pxd | 5 +- .../pylibcudf/libcudf/search.pxd | 9 +- .../pylibcudf/libcudf/sorting.pxd | 15 +- .../pylibcudf/libcudf/stream_compaction.pxd | 13 +- .../pylibcudf/libcudf/stream_compaction.pyx | 0 .../pylibcudf/libcudf/strings/CMakeLists.txt | 0 .../pylibcudf/libcudf/strings/__init__.pxd | 0 .../pylibcudf/libcudf/strings/__init__.py | 0 .../pylibcudf/libcudf/strings/attributes.pxd | 5 +- .../pylibcudf/libcudf/strings/capitalize.pxd | 11 +- .../pylibcudf/libcudf/strings/case.pxd | 5 +- .../pylibcudf/libcudf/strings/char_types.pxd | 7 +- .../pylibcudf/libcudf/strings/char_types.pyx | 0 .../pylibcudf/libcudf/strings/combine.pxd | 9 +- .../pylibcudf/libcudf/strings/contains.pxd | 9 +- .../libcudf/strings/convert/__init__.pxd | 0 .../libcudf/strings/convert/__init__.py | 0 .../strings/convert/convert_booleans.pxd | 7 +- .../strings/convert/convert_datetime.pxd | 7 +- .../strings/convert/convert_durations.pxd | 7 +- .../strings/convert/convert_fixed_point.pxd | 7 +- .../strings/convert/convert_floats.pxd | 7 +- .../strings/convert/convert_integers.pxd | 7 +- .../libcudf/strings/convert/convert_ipv4.pxd | 5 +- .../libcudf/strings/convert/convert_lists.pxd | 7 +- .../libcudf/strings/convert/convert_urls.pxd | 5 +- .../pylibcudf/libcudf/strings/extract.pxd | 14 ++ .../pylibcudf/libcudf/strings/find.pxd | 9 +- .../libcudf/strings/find_multiple.pxd | 5 +- .../pylibcudf/libcudf/strings/findall.pxd | 7 +- .../pylibcudf/libcudf/strings/json.pxd | 7 +- .../pylibcudf/libcudf/strings/padding.pxd | 11 +- .../pylibcudf/libcudf/strings/regex_flags.pxd | 0 .../pylibcudf/libcudf/strings/regex_flags.pyx | 0 .../libcudf/strings/regex_program.pxd | 3 +- .../pylibcudf/libcudf/strings/repeat.pxd | 7 +- .../pylibcudf/libcudf/strings/replace.pxd | 9 +- .../pylibcudf/libcudf/strings/replace_re.pxd | 13 +- .../pylibcudf/libcudf/strings/side_type.pxd | 0 .../libcudf/strings/split/__init__.pxd | 0 .../libcudf/strings/split/__init__.py | 0 .../libcudf/strings/split/partition.pxd | 9 +- .../pylibcudf/libcudf/strings/split/split.pxd | 13 +- .../pylibcudf/libcudf/strings/strip.pxd | 9 +- .../pylibcudf/libcudf/strings/substring.pxd | 9 +- 
.../pylibcudf/libcudf/strings/translate.pxd | 9 +- .../pylibcudf/libcudf/strings/wrap.pxd | 7 +- .../pylibcudf/libcudf/strings_udf.pxd | 7 +- .../pylibcudf/libcudf/table/__init__.pxd | 0 .../pylibcudf/libcudf/table/__init__.py | 0 .../pylibcudf/libcudf/table/table.pxd | 10 +- .../pylibcudf/libcudf/table/table_view.pxd | 5 +- .../pylibcudf/libcudf/transform.pxd | 17 +- .../pylibcudf/libcudf/transpose.pxd | 5 +- .../pylibcudf/libcudf/types.pxd | 0 .../pylibcudf/libcudf/types.pyx | 0 .../pylibcudf/libcudf/unary.pxd | 7 +- .../pylibcudf/libcudf/unary.pyx | 0 .../pylibcudf/libcudf/utilities/__init__.pxd | 0 .../pylibcudf/libcudf/utilities/__init__.py | 0 .../pylibcudf/libcudf/utilities/host_span.pxd | 0 .../pylibcudf/libcudf/utilities/traits.pxd | 3 +- .../libcudf/utilities/type_dispatcher.pxd | 2 +- .../pylibcudf/libcudf/wrappers/__init__.pxd | 0 .../pylibcudf/libcudf/wrappers/__init__.py | 0 .../pylibcudf/libcudf/wrappers/decimals.pxd | 3 +- .../pylibcudf/libcudf/wrappers/durations.pxd | 0 .../pylibcudf/libcudf/wrappers/timestamps.pxd | 0 .../_lib => pylibcudf}/pylibcudf/lists.pxd | 3 +- .../_lib => pylibcudf}/pylibcudf/lists.pyx | 29 ++- .../_lib => pylibcudf}/pylibcudf/merge.pxd | 0 .../_lib => pylibcudf}/pylibcudf/merge.pyx | 9 +- .../pylibcudf/quantiles.pxd | 3 +- .../pylibcudf/quantiles.pyx | 11 +- .../_lib => pylibcudf}/pylibcudf/reduce.pxd | 2 +- .../_lib => pylibcudf}/pylibcudf/reduce.pyx | 17 +- .../_lib => pylibcudf}/pylibcudf/replace.pxd | 3 +- .../_lib => pylibcudf}/pylibcudf/replace.pyx | 7 +- .../_lib => pylibcudf}/pylibcudf/reshape.pxd | 2 +- .../_lib => pylibcudf}/pylibcudf/reshape.pyx | 9 +- .../_lib => pylibcudf}/pylibcudf/rolling.pxd | 2 +- .../_lib => pylibcudf}/pylibcudf/rolling.pyx | 9 +- .../_lib => pylibcudf}/pylibcudf/round.pxd | 3 +- .../_lib => pylibcudf}/pylibcudf/round.pyx | 10 +- .../_lib => pylibcudf}/pylibcudf/scalar.pxd | 3 +- .../_lib => pylibcudf}/pylibcudf/scalar.pyx | 7 +- .../_lib => pylibcudf}/pylibcudf/search.pxd | 0 .../_lib => pylibcudf}/pylibcudf/search.pyx | 7 +- .../_lib => pylibcudf}/pylibcudf/sorting.pxd | 10 +- .../_lib => pylibcudf}/pylibcudf/sorting.pyx | 11 +- .../pylibcudf/stream_compaction.pxd | 6 +- .../pylibcudf/stream_compaction.pyx | 17 +- .../pylibcudf/strings/CMakeLists.txt | 0 .../pylibcudf/strings/__init__.pxd | 0 .../pylibcudf/strings/__init__.py | 0 .../pylibcudf/strings/capitalize.pxd | 4 +- .../pylibcudf/strings/capitalize.pyx | 15 +- .../pylibcudf/strings/case.pxd | 2 +- .../pylibcudf/strings/case.pyx | 7 +- .../pylibcudf/strings/char_types.pxd | 3 + .../pylibcudf/strings/char_types.pyx | 2 +- .../pylibcudf/pylibcudf/strings/contains.pxd | 7 + .../pylibcudf/strings/contains.pyx | 9 +- .../pylibcudf/strings/find.pxd | 6 +- .../pylibcudf/strings/find.pyx | 27 ++- .../pylibcudf/strings/regex_flags.pxd | 2 + .../pylibcudf/strings/regex_flags.pyx | 2 +- .../pylibcudf/strings/regex_program.pxd | 3 +- .../pylibcudf/strings/regex_program.pyx | 8 +- .../pylibcudf/strings/replace.pxd | 6 +- .../pylibcudf/strings/replace.pyx | 15 +- .../pylibcudf/strings/slice.pxd | 4 +- .../pylibcudf/strings/slice.pyx | 21 ++- .../_lib => pylibcudf}/pylibcudf/table.pxd | 5 +- .../_lib => pylibcudf}/pylibcudf/table.pyx | 7 +- .../pylibcudf/tests}/common/utils.py | 14 +- .../pylibcudf/tests}/conftest.py | 5 +- .../pylibcudf/tests}/io/test_avro.py | 3 +- .../pylibcudf/tests}/io/test_csv.py | 5 +- .../pylibcudf/tests}/io/test_json.py | 5 +- .../pylibcudf/tests}/io/test_parquet.py | 7 +- .../tests}/io/test_source_sink_info.py | 3 +- 
.../pylibcudf/tests}/pytest.ini | 0 .../pylibcudf/tests}/test_binaryops.py | 3 +- .../pylibcudf/tests}/test_column_factories.py | 3 +- .../tests}/test_column_from_device.py | 3 +- .../pylibcudf/tests}/test_copying.py | 3 +- .../pylibcudf/tests}/test_datetime.py | 6 +- .../pylibcudf/tests}/test_expressions.py | 3 +- .../pylibcudf/tests}/test_interop.py | 3 +- .../pylibcudf/tests}/test_join.py | 3 +- .../pylibcudf/tests}/test_lists.py | 6 +- .../pylibcudf/tests}/test_quantiles.py | 3 +- .../pylibcudf/tests}/test_regex_program.py | 3 +- .../pylibcudf/tests}/test_reshape.py | 3 +- .../pylibcudf/tests}/test_round.py | 8 +- .../tests}/test_string_capitalize.py | 10 +- .../pylibcudf/tests}/test_string_case.py | 10 +- .../pylibcudf/tests}/test_string_contains.py | 6 +- .../pylibcudf/tests}/test_string_find.py | 8 +- .../pylibcudf/tests}/test_string_replace.py | 8 +- .../pylibcudf/tests}/test_string_slice.py | 3 +- .../pylibcudf/tests}/test_table.py | 3 +- .../pylibcudf/tests}/test_traits.py | 2 +- .../pylibcudf/tests}/test_transform.py | 3 +- .../pylibcudf/tests}/test_unary.py | 2 +- .../_lib => pylibcudf}/pylibcudf/traits.pxd | 0 .../_lib => pylibcudf}/pylibcudf/traits.pyx | 3 +- .../pylibcudf/transform.pxd | 0 .../pylibcudf/transform.pyx | 5 +- .../_lib => pylibcudf}/pylibcudf/types.pxd | 3 +- .../_lib => pylibcudf}/pylibcudf/types.pyx | 27 ++- .../_lib => pylibcudf}/pylibcudf/unary.pxd | 3 +- .../_lib => pylibcudf}/pylibcudf/unary.pyx | 9 +- .../_lib => pylibcudf}/pylibcudf/utils.pxd | 5 +- .../_lib => pylibcudf}/pylibcudf/utils.pyx | 5 +- .../_lib => pylibcudf/pylibcudf}/variant.pxd | 0 python/pylibcudf/pyproject.toml | 123 +++++++++++++ 475 files changed, 1916 insertions(+), 1522 deletions(-) create mode 100755 ci/build_wheel_pylibcudf.sh create mode 100644 conda/recipes/pylibcudf/build.sh create mode 100644 conda/recipes/pylibcudf/conda_build_config.yaml create mode 100644 conda/recipes/pylibcudf/meta.yaml delete mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd delete mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd delete mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd delete mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd delete mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd create mode 100644 python/pylibcudf/CMakeLists.txt create mode 120000 python/pylibcudf/README.md rename python/{cudf => pylibcudf}/cmake/Modules/LinkPyarrowHeaders.cmake (100%) rename python/{cudf => pylibcudf}/cmake/Modules/WheelHelpers.cmake (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/CMakeLists.txt (96%) create mode 120000 python/pylibcudf/pylibcudf/VERSION rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/__init__.py (99%) create mode 100644 python/pylibcudf/pylibcudf/_version.py rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/aggregation.pxd (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/aggregation.pyx (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/binaryop.pxd (90%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/binaryop.pyx (86%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column.pxd (84%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column.pyx (98%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column_factories.pxd (92%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column_factories.pyx (96%) rename python/{cudf/cudf/_lib => 
pylibcudf}/pylibcudf/concatenate.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/concatenate.pyx (80%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/copying.pxd (94%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/copying.pyx (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/datetime.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/datetime.pyx (78%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/exception_handler.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/experimental.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/experimental.pyx (92%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/expressions.pxd (91%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/expressions.pyx (94%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/filling.pxd (90%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/filling.pyx (94%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/gpumemoryview.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/gpumemoryview.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/groupby.pxd (87%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/groupby.pyx (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/interop.pyx (98%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/CMakeLists.txt (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/__init__.py (100%) create mode 100644 python/pylibcudf/pylibcudf/io/avro.pxd rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/avro.pyx (89%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/csv.pyx (97%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/datasource.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/datasource.pyx (87%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/json.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/json.pyx (95%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/parquet.pxd (72%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/parquet.pyx (93%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/types.pxd (87%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/types.pyx (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/join.pxd (91%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/join.pyx (95%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/CMakeLists.txt (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/aggregation.pxd (98%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/aggregation.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/binaryop.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/binaryop.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/column.pxd (87%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/column_factories.pxd (93%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/column_view.pxd (97%) rename python/{cudf/cudf/_lib => 
pylibcudf}/pylibcudf/libcudf/concatenate.pxd (77%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/contiguous_split.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/copying.pxd (90%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/copying.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/datetime.pxd (92%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/experimental.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/expressions.pxd (90%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/expressions.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/filling.pxd (74%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/groupby.pxd (83%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/hash.pxd (86%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/interop.pxd (87%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/CMakeLists.txt (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/arrow_io_source.pxd (86%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/avro.pxd (91%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/csv.pxd (98%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/data_sink.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/datasource.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/json.pxd (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/json.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/orc.pxd (97%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/orc_metadata.pxd (94%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/parquet.pxd (80%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/parquet_metadata.pxd (89%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/text.pxd (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/timezone.pxd (86%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/types.pxd (92%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/types.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/join.pxd (88%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/labeling.pxd (78%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/combine.pxd (78%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/contains.pxd (75%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/count_elements.pxd (61%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/explode.pxd (59%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/extract.pxd (64%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/filling.pxd (76%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/gather.pxd (67%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/lists_column_view.pxd (86%) rename python/{cudf/cudf/_lib => 
pylibcudf}/pylibcudf/libcudf/lists/reverse.pxd (62%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/set_operations.pxd (81%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/sorting.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/stream_compaction.pxd (68%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/merge.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/null_mask.pxd (80%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd (73%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/edit_distance.pxd (75%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/generate_ngrams.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/jaccard.pxd (61%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/minhash.pxd (70%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd (58%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/normalize.pxd (75%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/replace.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/stemmer.pxd (79%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/subword_tokenize.pxd (92%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/tokenize.pxd (84%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/partitioning.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/quantiles.pxd (70%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/reduce.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/reduce.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/replace.pxd (83%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/replace.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/reshape.pxd (57%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/rolling.pxd (64%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/round.pxd (75%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/round.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/scalar.pxd (91%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/scalar_factories.pxd (76%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/search.pxd (73%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/sorting.pxd (84%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/stream_compaction.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/stream_compaction.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/CMakeLists.txt (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/attributes.pxd (76%) rename 
python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/capitalize.pxd (63%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/case.pxd (81%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/char_types.pxd (82%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/char_types.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/combine.pxd (83%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/contains.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_booleans.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_datetime.pxd (76%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_durations.pxd (72%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd (73%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_floats.pxd (71%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_integers.pxd (80%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd (76%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_lists.pxd (62%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_urls.pxd (72%) create mode 100644 python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/find.pxd (83%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/find_multiple.pxd (68%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/findall.pxd (56%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/json.pxd (79%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/padding.pxd (59%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/regex_flags.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/regex_flags.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/regex_program.pxd (84%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/repeat.pxd (67%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/replace.pxd (73%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/replace_re.pxd (63%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/side_type.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/partition.pxd (63%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/split.pxd (78%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/strip.pxd (52%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/substring.pxd (66%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/translate.pxd (73%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/wrap.pxd (58%) rename 
python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings_udf.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/table.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/table_view.pxd (87%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/transform.pxd (73%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/transpose.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/types.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/types.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/unary.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/unary.pyx (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/host_span.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/traits.pxd (93%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/type_dispatcher.pxd (73%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/decimals.pxd (90%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/durations.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/timestamps.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/lists.pxd (94%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/lists.pyx (95%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/merge.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/merge.pyx (83%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/quantiles.pxd (86%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/quantiles.pyx (93%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reduce.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reduce.pyx (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/replace.pxd (92%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/replace.pyx (97%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reshape.pxd (80%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reshape.pyx (86%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/rolling.pxd (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/rolling.pyx (89%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/round.pxd (77%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/round.pyx (85%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/scalar.pxd (92%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/scalar.pyx (94%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/search.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/search.pyx (93%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/sorting.pxd (87%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/sorting.pyx (96%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/stream_compaction.pxd (89%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/stream_compaction.pyx 
(95%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/CMakeLists.txt (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/__init__.pxd (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/__init__.py (100%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/capitalize.pxd (64%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/capitalize.pyx (84%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/case.pxd (76%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/case.pyx (79%) create mode 100644 python/pylibcudf/pylibcudf/strings/char_types.pxd rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/char_types.pyx (64%) create mode 100644 python/pylibcudf/pylibcudf/strings/contains.pxd rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/contains.pyx (75%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/find.pxd (77%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/find.pyx (90%) create mode 100644 python/pylibcudf/pylibcudf/strings/regex_flags.pxd rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/regex_flags.pyx (59%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/regex_program.pxd (70%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/regex_program.pyx (84%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/replace.pxd (71%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/replace.pyx (90%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/slice.pxd (69%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/slice.pyx (81%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/table.pxd (78%) rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/table.pyx (93%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/common/utils.py (97%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/conftest.py (98%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_avro.py (98%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_csv.py (98%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_json.py (99%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_parquet.py (97%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_source_sink_info.py (98%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/pytest.ini (100%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_binaryops.py (99%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_column_factories.py (99%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_column_from_device.py (97%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_copying.py (99%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_datetime.py (83%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_expressions.py (97%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_interop.py (98%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_join.py (94%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_lists.py (99%) rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_quantiles.py (99%) rename python/{cudf/cudf/pylibcudf_tests 
=> pylibcudf/pylibcudf/tests}/test_regex_program.py (89%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_reshape.py (96%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_round.py (86%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_capitalize.py (86%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_case.py (80%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_contains.py (92%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_find.py (97%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_replace.py (95%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_slice.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_table.py (93%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_traits.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_transform.py (95%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_unary.py (93%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/traits.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/traits.pyx (98%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/transform.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/transform.pyx (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/types.pxd (91%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/types.pyx (66%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/unary.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/unary.pyx (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/utils.pxd (71%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/utils.pyx (93%)
 rename python/{cudf/cudf/_lib => pylibcudf/pylibcudf}/variant.pxd (100%)
 create mode 100644 python/pylibcudf/pyproject.toml

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 48967417af3..90cdda4d3ca 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,7 +12,7 @@ cudf.polars:
   - 'python/cudf_polars/**'
 
 pylibcudf:
-  - 'python/cudf/cudf/_lib/pylibcudf/**'
+  - 'python/cudf/pylibcudf/**'
 
 libcudf:
   - 'cpp/**'
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index ea8a1762b2c..74bdc666c68 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -23,6 +23,7 @@ jobs:
       - static-configure
      - conda-notebook-tests
       - docs-build
+      - wheel-build-pylibcudf
       - wheel-build-cudf
       - wheel-tests-cudf
       - wheel-build-cudf-polars
@@ -120,10 +121,17 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-cudf:
+  wheel-build-pylibcudf:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    with:
+      build_type: pull-request
+      script: "ci/build_wheel_pylibcudf.sh"
+  wheel-build-cudf:
+    needs: wheel-build-pylibcudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
@@ -135,7 +143,7 @@
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-cudf-polars:
-    needs: wheel-build-cudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
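Note: the `needs:` edges above serialize the wheel jobs: pylibcudf builds first, and both cudf and cudf-polars consume it. A rough local equivalent of that ordering (flags illustrative; assumes libcudf and a CUDA toolchain are already installed):

    # Build and install the pylibcudf wheel, then build cudf against it.
    pip wheel python/pylibcudf -w /tmp/dist --no-deps
    pip install /tmp/dist/pylibcudf*.whl
    pip wheel python/cudf -w /tmp/dist --no-deps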
diff --git a/build.sh b/build.sh
index 52bb1e64d16..957f41aedac 100755
--- a/build.sh
+++ b/build.sh
@@ -17,11 +17,12 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
-HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"]
+VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
+HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"]
    clean - remove all existing build artifacts and configuration (start over)
    libcudf - build the cudf C++ code only
+   pylibcudf - build the pylibcudf Python package
    cudf - build the cudf Python package
    cudfjar - build cudf JAR with static libcudf using devtoolset toolchain
    dask_cudf - build the dask_cudf Python package
@@ -268,7 +269,7 @@ fi
 ################################################################################
 # Configure, build, and install libcudf
 
-if buildAll || hasArg libcudf || hasArg cudf || hasArg cudfjar; then
+if buildAll || hasArg libcudf || hasArg pylibcudf || hasArg cudf || hasArg cudfjar; then
 
     if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
         CUDF_CMAKE_CUDA_ARCHITECTURES="${CUDF_CMAKE_CUDA_ARCHITECTURES:-NATIVE}"
         if [[ "$CUDF_CMAKE_CUDA_ARCHITECTURES" == "NATIVE" ]]; then
@@ -340,6 +341,14 @@ if buildAll || hasArg libcudf; then
     fi
 fi
 
+# Build and install the pylibcudf Python package
+if buildAll || hasArg pylibcudf; then
+
+    cd ${REPODIR}/python/pylibcudf
+    SKBUILD_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX};-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR};-DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES};${EXTRA_CMAKE_ARGS}" \
+        python ${PYTHON_ARGS_FOR_INSTALL} .
+fi
+
 # Build and install the cudf Python package
 if buildAll || hasArg cudf; then
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 14dc7a59048..c67d127e635 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -29,7 +29,7 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcudf cudf dask-cudf
+  libcudf pylibcudf cudf dask-cudf
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 79e09432779..2e3f70ba767 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -22,9 +22,16 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 # TODO: Remove `--no-test` flag once importing on a CPU
 # node works correctly
 # With boa installed conda build forwards to the boa builder
+
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
+  --no-test \
+  --channel "${CPP_CHANNEL}" \
+  conda/recipes/pylibcudf
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
+  --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf
 
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index 1b563bc499c..7c0fb1efebe 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -7,10 +7,14 @@ package_dir="python/cudf"
 
 export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
 
+# Download the pylibcudf built in the previous step
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 /tmp/pylibcudf_dist
+
+echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
+
 ./ci/build_wheel.sh ${package_dir}
 
 python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
-
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
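Note: the `PIP_CONSTRAINT` file written above is what pins cudf's pylibcudf build requirement to the wheel produced earlier in the pipeline instead of a published index copy. A minimal sketch of the mechanism (the wheel filename is illustrative):

    echo "pylibcudf-cu12 @ file:///tmp/pylibcudf_dist/pylibcudf_cu12-24.10.0a0-cp311-cp311-linux_x86_64.whl" > /tmp/constraints.txt
    export PIP_CONSTRAINT=/tmp/constraints.txt
    python -m pip install ./python/cudf   # the resolver now takes pylibcudf from the local file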
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
new file mode 100755
index 00000000000..b25d118ff81
--- /dev/null
+++ b/ci/build_wheel_pylibcudf.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_dir="python/pylibcudf"
+
+export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
+
+./ci/build_wheel.sh ${package_dir}
+
+python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index 48ee4a05628..8deaeab78a3 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -11,7 +11,9 @@ rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids
 rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl)
 python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests]
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 1c3b99953fb..bfb655db3ca 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -36,7 +36,9 @@ if [ "$no_cudf" = true ]; then
     echo "Skipping cudf install"
 else
     RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
     RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+    python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl)
     python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests]
 fi
diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh
index 217dd2fd9a8..ae34047e87f 100755
--- a/ci/test_python_cudf.sh
+++ b/ci/test_python_cudf.sh
@@ -15,7 +15,7 @@ trap "EXITCODE=1" ERR
 set +e
 
 rapids-logger "pytest pylibcudf"
-pushd python/cudf/cudf/pylibcudf_tests
+pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
   --dist=worksteal \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index fdb61278d36..5a2c3ccac8f 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -3,11 +3,15 @@
 
 set -eou pipefail
 
+# Download the pylibcudf built in the previous step
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/cudf*.whl)[test]
+# Install both pylibcudf and cudf
+python -m pip install \
+  "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)[test]" \
+  "$(echo ./dist/cudf*.whl)[test]"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
@@ -15,7 +19,7 @@ mkdir -p "${RAPIDS_TESTS_DIR}"
 
 rapids-logger "pytest pylibcudf"
-pushd python/cudf/cudf/pylibcudf_tests
+pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
   --dist=worksteal \
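Note: with the test tree moved out of the cudf package, a local run against any installed pylibcudf mirrors the pushd above (pytest-xdist is assumed for the --dist=worksteal flag):

    cd python/pylibcudf/pylibcudf/tests
    python -m pytest --cache-clear -n auto --dist=worksteal .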
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index cc9f5788685..357d4170d47 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -10,7 +10,7 @@ set -eou pipefail
 # files in cudf_polars/pylibcudf", rather than "are there changes
 # between upstream and this branch which touch cudf_polars/pylibcudf"
 # TODO: is the target branch exposed anywhere in an environment variable?
-if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
+if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ];
 then
     HAS_CHANGES=1
 else
@@ -21,8 +21,8 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
 # Download the cudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install ./local-cudf-dep/cudf*.whl
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
 
 rapids-logger "Install cudf_polars"
 python -m pip install $(echo ./dist/cudf_polars*.whl)[test]
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index c3800d3cc25..4d045472604 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -7,8 +7,11 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
 # Download the cudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install ./local-cudf-dep/cudf*.whl
+python -m pip install \
+  "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)" \
+  "$(echo ./local-cudf-dep/cudf*.whl)"
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 8d5fc2e31d9..d0d18e57abc 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -56,12 +56,14 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
+- numpy
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcc_linux-64=11.8
 - nvcomp==3.0.6
 - nvtx>=0.2.1
 - packaging
+- pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pip
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 7b0485d7f29..caf39a32d79 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -55,11 +55,13 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
+- numpy
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcomp==3.0.6
 - nvtx>=0.2.1
 - packaging
+- pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pip
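Note: the environment files now list bare `numpy` and `pandas` (pulled in by the new test_python_pylibcudf dependency list) next to the original pinned specs; conda treats duplicate specs as an intersection, so the effective pins are unchanged. Recreating the environment from the updated file:

    conda env create -n cudf_dev -f conda/environments/all_cuda-125_arch-x86_64.yaml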
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 8d7ef63715b..7e86147732e 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -68,6 +68,7 @@ requirements:
     - numpy 1.23
     - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
+    - pylibcudf ={{ version }}
     - rmm ={{ minor_version }}
 {% if cuda_major == "11" %}
     - cudatoolkit
@@ -87,6 +88,7 @@ requirements:
     - numpy >=1.23,<2.0a0
     - {{ pin_compatible('pyarrow', max_pin='x.x') }}
     - libcudf ={{ version }}
+    - pylibcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
 {% if cuda_major == "11" %}
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 748a32e5518..d04d9b21a46 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -58,7 +58,7 @@ requirements:
     - python
     - cython >=3.0.3
     - cuda-version ={{ cuda_version }}
-    - cudf ={{ version }}
+    - pylibcudf ={{ version }}
     - libcudf_kafka ={{ version }}
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.10.0
@@ -69,7 +69,7 @@ requirements:
     - python
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     - libcudf_kafka ={{ version }}
-    - cudf ={{ version }}
+    - pylibcudf ={{ version }}
 {% if cuda_major != "11" %}
     - cuda-cudart
 {% endif %}
diff --git a/conda/recipes/pylibcudf/build.sh b/conda/recipes/pylibcudf/build.sh
new file mode 100644
index 00000000000..483346504db
--- /dev/null
+++ b/conda/recipes/pylibcudf/build.sh
@@ -0,0 +1,4 @@
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+
+# This assumes the script is executed from the root of the repo directory
+./build.sh pylibcudf
diff --git a/conda/recipes/pylibcudf/conda_build_config.yaml b/conda/recipes/pylibcudf/conda_build_config.yaml
new file mode 100644
index 00000000000..af894cccda0
--- /dev/null
+++ b/conda/recipes/pylibcudf/conda_build_config.yaml
@@ -0,0 +1,20 @@
+c_compiler_version:
+  - 11
+
+cxx_compiler_version:
+  - 11
+
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
+  - "2.17"
+
+cmake_version:
+  - ">=3.26.4,!=3.30.0"
+
+cuda_compiler:
+  - cuda-nvcc
+
+cuda11_compiler:
+  - nvcc
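Note: a local build of the new recipe mirrors what ci/build_python.sh does; boa is assumed for the mambabuild subcommand, and the RAPIDS_* values below are illustrative stand-ins for what CI injects:

    export RAPIDS_CUDA_VERSION=12.5.1
    export RAPIDS_DATE_STRING=$(date +%y%m%d)
    RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) conda mambabuild --no-test conda/recipes/pylibcudf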
diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
new file mode 100644
index 00000000000..f405fd10f5d
--- /dev/null
+++ b/conda/recipes/pylibcudf/meta.yaml
@@ -0,0 +1,108 @@
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+
+{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
+{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
+{% set py_version = environ['CONDA_PY'] %}
+{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
+{% set cuda_major = cuda_version.split('.')[0] %}
+{% set date_string = environ['RAPIDS_DATE_STRING'] %}
+
+package:
+  name: pylibcudf
+  version: {{ version }}
+
+source:
+  path: ../../..
+
+build:
+  number: {{ GIT_DESCRIBE_NUMBER }}
+  string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+  script_env:
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+    - AWS_SESSION_TOKEN
+    - CMAKE_C_COMPILER_LAUNCHER
+    - CMAKE_CUDA_COMPILER_LAUNCHER
+    - CMAKE_CXX_COMPILER_LAUNCHER
+    - CMAKE_GENERATOR
+    - PARALLEL_LEVEL
+    - SCCACHE_BUCKET
+    - SCCACHE_IDLE_TIMEOUT
+    - SCCACHE_REGION
+    - SCCACHE_S3_KEY_PREFIX=pylibcudf-aarch64 # [aarch64]
+    - SCCACHE_S3_KEY_PREFIX=pylibcudf-linux64 # [linux64]
+    - SCCACHE_S3_USE_SSL
+    - SCCACHE_S3_NO_CREDENTIALS
+  ignore_run_exports:
+    # libcudf's run_exports pinning is looser than we would like
+    - libcudf
+  ignore_run_exports_from:
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }}
+    {% else %}
+    - {{ compiler('cuda') }}
+    - cuda-cudart-dev
+    - libcufile-dev # [linux64]
+    {% endif %}
+
+requirements:
+  build:
+    - cmake {{ cmake_version }}
+    - ninja
+    - {{ compiler('c') }}
+    - {{ compiler('cxx') }}
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    {% else %}
+    - {{ compiler('cuda') }}
+    {% endif %}
+    - cuda-version ={{ cuda_version }}
+    - {{ stdlib("c") }}
+  host:
+    - python
+    - cython >=3.0.3
+    - rapids-build-backend >=0.3.0,<0.4.0.dev0
+    - scikit-build-core >=0.10.0
+    - dlpack >=0.8,<1.0
+    # TODO: Change to `2.0` for NumPy 2
+    - numpy 1.23
+    - pyarrow ==16.1.0.*
+    - libcudf ={{ version }}
+    - rmm ={{ minor_version }}
+    {% if cuda_major == "11" %}
+    - cudatoolkit
+    {% else %}
+    - cuda-cudart-dev
+    - cuda-nvrtc
+    - libcufile-dev # [linux64]
+    {% endif %}
+    - cuda-version ={{ cuda_version }}
+  run:
+    - python
+    - typing_extensions >=4.0.0
+    - pandas >=2.0,<2.2.3dev0
+    # TODO: Update `numpy` in `host` when dropping `<2.0a0`
+    - numpy >=1.23,<2.0a0
+    - {{ pin_compatible('pyarrow', max_pin='x.x') }}
+    - {{ pin_compatible('rmm', max_pin='x.x') }}
+    - fsspec >=0.6.0
+    {% if cuda_major == "11" %}
+    - cuda-python >=11.7.1,<12.0a0
+    {% else %}
+    - cuda-python >=12.0,<13.0a0
+    {% endif %}
+    - nvtx >=0.2.1
+    - packaging
+
+test:
+  requires:
+    - cuda-version ={{ cuda_version }}
+  imports:
+    - pylibcudf
+
+about:
+  home: https://rapids.ai/
+  license: Apache-2.0
+  license_family: APACHE
+  license_file: LICENSE
+  summary: pylibcudf library
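Note: the recipe's `test.imports` entry amounts to an import smoke test; CI skips it for now via `--no-test`, but on a machine with the CUDA runtime available it reduces to:

    python -c "import pylibcudf"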
diff --git a/dependencies.yaml b/dependencies.yaml
index b0d62a9fb0d..ca615905a15 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -10,6 +10,7 @@ files:
         - build_all
         - build_cpp
         - build_python_common
+        - build_python_pylibcudf
        - build_python_cudf
         - cuda
         - cuda_version
@@ -22,12 +23,14 @@ files:
         - rapids_build_setuptools
         - run_common
         - run_cudf
+        - run_pylibcudf
         - run_dask_cudf
         - run_custreamz
         - test_cpp
         - test_python_common
         - test_python_cudf
         - test_python_dask_cudf
+        - test_python_pylibcudf
         - depends_on_cupy
   test_static_build:
     output: none
@@ -76,14 +79,14 @@ files:
       - docs
       - libarrow_run
      - py_version
-  py_rapids_build_cudf:
+  py_build_cudf:
     output: pyproject
     pyproject_dir: python/cudf
     extras:
       table: build-system
     includes:
       - rapids_build_skbuild
-  py_build_cudf:
+  py_rapids_build_cudf:
     output: pyproject
     pyproject_dir: python/cudf
     extras:
@@ -93,6 +96,7 @@ files:
       - build_base
       - build_python_common
       - build_python_cudf
+      - pylibcudf_build_dep
   py_run_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -103,6 +107,7 @@ files:
       - run_cudf
       - pyarrow_run
       - depends_on_cupy
+      - depends_on_pylibcudf
   py_test_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -112,6 +117,40 @@ files:
     includes:
       - test_python_common
       - test_python_cudf
+  py_rapids_build_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: build-system
+    includes:
+      - rapids_build_skbuild
+  py_build_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes:
+      - build_base
+      - build_python_common
+      - build_python_pylibcudf
+  py_run_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: project
+    includes:
+      - run_pylibcudf
+      - pyarrow_run
+  py_test_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: project.optional-dependencies
+      key: test
+    includes:
+      - test_python_common
+      - test_python_pylibcudf
   py_test_pandas_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -142,7 +181,7 @@ files:
       table: project
     includes:
       - run_cudf_polars
-      - depends_on_cudf
+      - depends_on_pylibcudf
   py_test_cudf_polars:
     output: pyproject
     pyproject_dir: python/cudf_polars
@@ -326,11 +365,36 @@ dependencies:
           # Sync with conda build constraint & wheel run constraint.
          # TODO: Change to `2.0.*` for NumPy 2
           - numpy==1.23.*
-  build_python_cudf:
+  build_python_pylibcudf:
     common:
       - output_types: conda
         packages:
          - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*rmm_unsuffixed]}
+  build_python_cudf:
+    common:
+      - output_types: conda
+        packages:
+          - *rmm_unsuffixed
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
@@ -349,12 +413,33 @@ dependencies:
              cuda_suffixed: "true"
             packages:
               - rmm-cu12==24.10.*,>=0.0.0a0
+              - pylibcudf-cu12==24.10.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
               - rmm-cu11==24.10.*,>=0.0.0a0
+              - pylibcudf-cu11==24.10.*,>=0.0.0a0
           - {matrix: null, packages: [*rmm_unsuffixed]}
+  pylibcudf_build_dep:
+    common:
+      - output_types: conda
+        packages:
+          - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0
+    specific:
+      - output_types: [pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*pylibcudf_unsuffixed]}
   libarrow_build:
     common:
       - output_types: conda
@@ -560,6 +645,45 @@ dependencies:
       # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0`
       - numpy>=1.23,<2.0a0
       - pandas>=2.0,<2.2.3dev0
+  run_pylibcudf:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - nvtx>=0.2.1
+          - packaging
+          - typing_extensions>=4.0.0
+      - output_types: conda
+        packages:
+          - *rmm_unsuffixed
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [conda, requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.*"}
+            packages:
+              - cuda-python>=12.0,<13.0a0
+          - matrix: {cuda: "11.*"}
+            packages: &run_pylibcudf_packages_all_cu11
+              - cuda-python>=11.7.1,<12.0a0
+          - {matrix: null, packages: *run_pylibcudf_packages_all_cu11}
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*rmm_unsuffixed]}
   run_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -680,6 +804,14 @@ dependencies:
       - pytest<8
       - pytest-cov
       - pytest-xdist
+  test_python_pylibcudf:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - fastavro>=0.22.9
+          - hypothesis
+          - numpy
+          - pandas
   test_python_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -724,6 +856,31 @@ dependencies:
       packages:
         - dask-cuda==24.10.*,>=0.0.0a0
         - *numba
+  depends_on_pylibcudf:
+    common:
+      - output_types: conda
+        packages:
+          - *pylibcudf_unsuffixed
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm, cubinlinker, ptxcompiler.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*pylibcudf_unsuffixed]}
   depends_on_cudf:
     common:
       - output_types: conda
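Note: dependencies.yaml is the generator input for the conda environment files and pyproject tables touched in this patch; after editing it, the derived files are regenerated with the standard RAPIDS tool rather than by hand:

    python -m pip install rapids-dependency-file-generator
    rapids-dependency-file-generator   # rewrites conda/environments/*.yaml and the pyproject.toml dependency tables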
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst
index 739305af5d4..4b2b213b6c3 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst
@@ -2,5 +2,5 @@
 aggregation
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.aggregation
+.. automodule:: pylibcudf.aggregation
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst
index e5bc6aa7cda..8bbbfbf88c1 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst
@@ -2,5 +2,5 @@
 binaryop
 ========
 
-.. automodule:: cudf._lib.pylibcudf.binaryop
+.. automodule:: pylibcudf.binaryop
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst
index d1105d356b4..d26c8737cf4 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst
@@ -2,5 +2,5 @@
 Column
 ======
 
-.. automodule:: cudf._lib.pylibcudf.column
+.. automodule:: pylibcudf.column
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
index c858135b6ce..8dfaa4bae03 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
@@ -2,5 +2,5 @@
 column_factories
 ================
 
-.. automodule:: cudf._lib.pylibcudf.column_factories
+.. automodule:: pylibcudf.column_factories
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst
index e83739056f4..7912cb83767 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst
@@ -2,5 +2,5 @@
 concatenate
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.concatenate
+.. automodule:: pylibcudf.concatenate
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst
index fddd3ea440f..25e3ef50e6a 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst
@@ -2,5 +2,5 @@
 copying
 =======
 
-.. automodule:: cudf._lib.pylibcudf.copying
+.. automodule:: pylibcudf.copying
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
index 558268ea495..71f7874cfbe 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
@@ -2,5 +2,5 @@
 datetime
 ========
 
-.. automodule:: cudf._lib.pylibcudf.datetime
+.. automodule:: pylibcudf.datetime
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
index 03f769ee861..5493d4662a9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
@@ -2,5 +2,5 @@
 expressions
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.expressions
+.. automodule:: pylibcudf.expressions
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst
index 542a5e12bc4..0d328a0b0e9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst
@@ -2,5 +2,5 @@
 filling
 ========
 
-.. automodule:: cudf._lib.pylibcudf.filling
+.. automodule:: pylibcudf.filling
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst
index dffc7c24e02..5515a74adcc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst
@@ -2,5 +2,5 @@
 gpumemoryview
 =============
 
-.. automodule:: cudf._lib.pylibcudf.gpumemoryview
+.. automodule:: pylibcudf.gpumemoryview
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst
index d6e994f7dbc..27cda383818 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst
@@ -2,5 +2,5 @@
 groupby
 =======
 
-.. automodule:: cudf._lib.pylibcudf.groupby
+.. automodule:: pylibcudf.groupby
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
index 881ab8d7be4..0d2cb55212e 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
@@ -2,5 +2,5 @@
 interop
 =======
 
-.. automodule:: cudf._lib.pylibcudf.interop
+.. automodule:: pylibcudf.interop
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
index 495bd505fdc..1c57a6157f5 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
@@ -2,5 +2,5 @@
 Avro
 ====
 
-.. automodule:: cudf._lib.pylibcudf.io.avro
+.. automodule:: pylibcudf.io.avro
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
index 5a2276f8b2d..59f7d8fe54c 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
@@ -2,5 +2,5 @@
 CSV
 ===
 
-.. automodule:: cudf._lib.pylibcudf.io.csv
+.. automodule:: pylibcudf.io.csv
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index e2d342ffe47..c8933981736 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -5,7 +5,7 @@ I/O
 I/O Utility Classes
 ===================
 
-.. automodule:: cudf._lib.pylibcudf.io.types
+.. automodule:: pylibcudf.io.types
     :members:
 
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
index 6aeae1f322a..a4626f43cc3 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
@@ -2,5 +2,5 @@
 JSON
 ====
 
-.. automodule:: cudf._lib.pylibcudf.io.json
+.. automodule:: pylibcudf.io.json
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
index 9dfbadfa216..07c2503ab28 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
@@ -2,5 +2,5 @@
 Parquet
 =======
 
-.. automodule:: cudf._lib.pylibcudf.io.parquet
+.. automodule:: pylibcudf.io.parquet
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
index 05b9709d116..de065e4fc40 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
@@ -2,5 +2,5 @@
 join
 ====
 
-.. automodule:: cudf._lib.pylibcudf.join
+.. automodule:: pylibcudf.join
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst
index a127dd6006a..0fe1a876073 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst
@@ -2,5 +2,5 @@
 lists
 =====
 
-.. automodule:: cudf._lib.pylibcudf.lists
+.. automodule:: pylibcudf.lists
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst
index ef1189a064a..3f634ffcfd7 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst
@@ -2,5 +2,5 @@
 merge
 =====
 
-.. automodule:: cudf._lib.pylibcudf.merge
+.. automodule:: pylibcudf.merge
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
index 3417c1ff59d..0f0f701b5dc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
@@ -2,5 +2,5 @@
 quantiles
 =========
 
-.. automodule:: cudf._lib.pylibcudf.quantiles
+.. automodule:: pylibcudf.quantiles
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst
index e6f1b02331d..047f217c276 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst
@@ -2,5 +2,5 @@
 reduce
 ======
 
-.. automodule:: cudf._lib.pylibcudf.reduce
+.. automodule:: pylibcudf.reduce
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
index 7f846872fca..7410b20e1b0 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
@@ -2,5 +2,5 @@
 replace
 =======
 
-.. automodule:: cudf._lib.pylibcudf.replace
+.. automodule:: pylibcudf.replace
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
index 964cef04923..09ec0501bb9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
@@ -2,5 +2,5 @@
 reshape
 =======
 
-.. automodule:: cudf._lib.pylibcudf.reshape
+.. automodule:: pylibcudf.reshape
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst
index 0817d117a94..1f8da467e84 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst
@@ -2,5 +2,5 @@
 rolling
 =======
 
-.. automodule:: cudf._lib.pylibcudf.rolling
+.. automodule:: pylibcudf.rolling
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
index c97fda12301..e064357cbd1 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
@@ -2,5 +2,5 @@
 round
 =====
 
-.. automodule:: cudf._lib.pylibcudf.round
+.. automodule:: pylibcudf.round
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst
index b12f47618fb..a9100c6bb2d 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst
@@ -2,5 +2,5 @@
 Scalar
 ======
 
-.. automodule:: cudf._lib.pylibcudf.scalar
+.. automodule:: pylibcudf.scalar
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
index aa57bcd9d92..02307037994 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
@@ -2,5 +2,5 @@
 search
 ======
 
-.. automodule:: cudf._lib.pylibcudf.search
+.. automodule:: pylibcudf.search
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst
index e9441366eeb..b8fd8fda9bd 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst
@@ -2,5 +2,5 @@
 sorting
 =======
 
-.. automodule:: cudf._lib.pylibcudf.sorting
+.. automodule:: pylibcudf.sorting
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst
index 00b479446d8..0252d0684d9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst
@@ -2,5 +2,5 @@
 stream_compaction
 =================
 
-.. automodule:: cudf._lib.pylibcudf.stream_compaction
+.. automodule:: pylibcudf.stream_compaction
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
index 578b2b75e37..6b9ed8d47e7 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
@@ -2,5 +2,5 @@
 capitalize
 ==========
 
-.. automodule:: cudf._lib.pylibcudf.strings.capitalize
+.. automodule:: pylibcudf.strings.capitalize
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
index 577ec34915b..896fa6086db 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
@@ -2,5 +2,5 @@
 char_types
 ==========
 
-.. automodule:: cudf._lib.pylibcudf.strings.char_types
+.. automodule:: pylibcudf.strings.char_types
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
index e5745331bc7..d2d164be638 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
@@ -2,5 +2,5 @@
 contains
 ========
 
-.. automodule:: cudf._lib.pylibcudf.strings.contains
+.. automodule:: pylibcudf.strings.contains
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
index 61d4079e9a3..7c540e99929 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
@@ -2,5 +2,5 @@
 find
 ====
 
-.. automodule:: cudf._lib.pylibcudf.strings.find
+.. automodule:: pylibcudf.strings.find
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
index 0126b6a3706..53fd712d864 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
@@ -2,5 +2,5 @@
 regex_flags
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.strings.regex_flags
+.. automodule:: pylibcudf.strings.regex_flags
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
index 2f398186d51..6f3d2f6681c 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
@@ -2,5 +2,5 @@
 regex_program
 =============
 
-.. automodule:: cudf._lib.pylibcudf.strings.regex_program
+.. automodule:: pylibcudf.strings.regex_program
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
index 9575ec226a7..d5417adac43 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
@@ -2,5 +2,5 @@
 replace
 =======
 
-.. automodule:: cudf._lib.pylibcudf.strings.replace
+.. automodule:: pylibcudf.strings.replace
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
index 0ee5af71c03..e9908904512 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
@@ -2,5 +2,5 @@
 slice
 =====
 
-.. automodule:: cudf._lib.pylibcudf.strings.slice
+.. automodule:: pylibcudf.strings.slice
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
index d8337b6596d..e39ca18a12b 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
@@ -2,5 +2,5 @@
 Table
 =====
 
-.. automodule:: cudf._lib.pylibcudf.table
+.. automodule:: pylibcudf.table
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
index 294ca8dc78c..2cce7b9d7d7 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
@@ -2,5 +2,5 @@
 traits
 ======
 
-.. automodule:: cudf._lib.pylibcudf.traits
+.. automodule:: pylibcudf.traits
     :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
index ef04bbad7e6..839163f83fc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
@@ -2,5 +2,5 @@
 transform
 =========
 
-.. automodule:: cudf._lib.pylibcudf.transform
+..
automodule:: pylibcudf.transform :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst index 8d5409bbd97..75521ac2f4d 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst @@ -2,5 +2,5 @@ types ===== -.. automodule:: cudf._lib.pylibcudf.types +.. automodule:: pylibcudf.types :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst index add4baa0a54..34077242b90 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst @@ -2,5 +2,5 @@ unary ===== -.. automodule:: cudf._lib.pylibcudf.unary +.. automodule:: pylibcudf.unary :members: diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index ecadbf5cbbc..e11d62b3bd5 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -79,7 +79,7 @@ if(NOT cudf_FOUND) # require access to libcudf, we place the library and all its dependent artifacts in the cudf # directory as a single source of truth and modify the other rpaths appropriately. set(cython_lib_dir cudf) - include(cmake/Modules/WheelHelpers.cmake) + include(../pylibcudf/cmake/Modules/WheelHelpers.cmake) # TODO: This install is currently overzealous. We should only install the libraries that are # downloaded by CPM during the build, not libraries that were found on the system. However, in # practice right this would only be a problem is if libcudf was not found but some of the @@ -92,7 +92,7 @@ endif() rapids_cython_init() -include(cmake/Modules/LinkPyarrowHeaders.cmake) +include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index d32a2d8e3f8..d6182673308 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -73,5 +73,4 @@ target_link_libraries(interop PUBLIC nanoarrow) add_subdirectory(io) add_subdirectory(nvtext) -add_subdirectory(pylibcudf) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 34c0e29d0b1..918edb6d3f1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -21,7 +21,6 @@ orc, parquet, partitioning, - pylibcudf, quantiles, reduce, replace, diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 1616c24eec2..7c91533cf93 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -3,8 +3,9 @@ import pandas as pd from numba.np import numpy_support +import pylibcudf + import cudf -from cudf._lib import pylibcudf from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf.utils import cudautils diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index 3c132b22880..b1759635a36 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -2,8 +2,8 @@ from cudf._lib.utils cimport data_from_pylibcudf_io -import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.types import SourceInfo +import pylibcudf as plc +from pylibcudf.io.types import SourceInfo cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1): diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index 
2e352dd7904..e2547476849 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -4,7 +4,8 @@ from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.types cimport dtype_to_pylibcudf_type -from cudf._lib import pylibcudf +import pylibcudf + from cudf._lib.scalar import as_device_scalar from cudf.core.buffer import acquire_spill_lock diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 437f44af9f0..8ceea4920e2 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -5,14 +5,13 @@ from typing import Literal from libcpp cimport bool from libcpp.memory cimport unique_ptr -from rmm._lib.device_buffer cimport device_buffer - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type +from rmm._lib.device_buffer cimport device_buffer cdef class Column: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f0c07dfbc1b..2e400f775d3 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -7,11 +7,11 @@ import cupy as cp import numpy as np import pandas as pd +import pylibcudf import rmm import cudf import cudf._lib as libcudf -from cudf._lib import pylibcudf from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -39,18 +39,18 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column - -cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary -from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents -from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( +cimport pylibcudf.libcudf.copying as cpp_copying +cimport pylibcudf.libcudf.types as libcudf_types +cimport pylibcudf.libcudf.unary as libcudf_unary +from pylibcudf.libcudf.column.column cimport column, column_contents +from pylibcudf.libcudf.column.column_factories cimport ( make_column_from_scalar as cpp_make_column_from_scalar, make_numeric_column, ) -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count +from pylibcudf.libcudf.scalar.scalar cimport scalar + from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index 89ddcfee99e..e661059faa3 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -5,7 +5,8 @@ from libcpp cimport bool from cudf._lib.column cimport Column from cudf._lib.utils cimport data_from_pylibcudf_table -from cudf._lib import pylibcudf +import pylibcudf + from cudf.core.buffer import acquire_spill_lock diff --git a/python/cudf/cudf/_lib/copying.pxd b/python/cudf/cudf/_lib/copying.pxd index 8fc7f4e1da0..14c7d2066d8 100644 --- a/python/cudf/cudf/_lib/copying.pxd +++ 
b/python/cudf/cudf/_lib/copying.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.libcudf.contiguous_split cimport packed_columns +from pylibcudf.libcudf.contiguous_split cimport packed_columns cdef class _CPackedColumns: diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 796c70e615c..16182e31c08 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -10,8 +10,9 @@ from libcpp.vector cimport vector from rmm._lib.device_buffer cimport DeviceBuffer +import pylibcudf + import cudf -from cudf._lib import pylibcudf from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer from cudf._lib.column cimport Column @@ -26,17 +27,16 @@ from cudf.core.abc import Serializable from libcpp.memory cimport make_unique -cimport cudf._lib.pylibcudf.libcudf.contiguous_split as cpp_contiguous_split -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.gather cimport ( +cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view # workaround for https://github.com/cython/cython/issues/3885 diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 099b61d62ae..e9aa97ecbc9 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -6,8 +6,9 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource +cimport pylibcudf.libcudf.types as libcudf_types +from pylibcudf.io.datasource cimport Datasource, NativeFileDatasource + from cudf._lib.types cimport dtype_to_pylibcudf_type import errno @@ -23,22 +24,24 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from cudf._lib.io.utils cimport make_sink_info -from cudf._lib.pylibcudf.libcudf.io.csv cimport ( +from pylibcudf.libcudf.io.csv cimport ( csv_writer_options, write_csv as cpp_write_csv, ) -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type, sink_info -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.io.data_sink cimport data_sink +from pylibcudf.libcudf.io.types cimport compression_type, sink_info +from pylibcudf.libcudf.table.table_view cimport table_view + +from cudf._lib.io.utils cimport make_sink_info from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table from pyarrow.lib import NativeFile -import cudf._lib.pylibcudf as plc +import pylibcudf as plc + from cudf.api.types import is_hashable -from cudf._lib.pylibcudf.types cimport DataType +from 
pylibcudf.types cimport DataType CSV_HEX_TYPE_MAP = { "hex": np.dtype("int64"), diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index b30ef875a7b..483250dd36f 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -7,13 +7,14 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -cimport cudf._lib.pylibcudf.libcudf.datetime as libcudf_datetime +cimport pylibcudf.libcudf.datetime as libcudf_datetime +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.filling cimport calendrical_month_sequence +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.filling cimport calendrical_month_sequence -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx index b7302f3d07a..b2f4c620144 100644 --- a/python/cudf/cudf/_lib/filling.pyx +++ b/python/cudf/cudf/_lib/filling.pyx @@ -2,12 +2,12 @@ from cudf.core.buffer import acquire_spill_lock - from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_pylibcudf_table -from cudf._lib import pylibcudf +import pylibcudf + from cudf._lib.scalar import as_device_scalar diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 9d18e023fe8..c199ed96d4f 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -18,10 +18,11 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib.scalar import as_device_scalar -from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.replace cimport replace_policy +from pylibcudf.libcudf.scalar.scalar cimport scalar + +import pylibcudf -from cudf._lib import pylibcudf from cudf._lib.aggregation import make_aggregation # The sets below define the possible aggregations that can be performed on diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index b8331d5a226..48f75b12a73 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -7,10 +7,9 @@ from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.hash cimport ( +cimport pylibcudf.libcudf.types as libcudf_types +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.hash cimport ( md5, murmurhash3_x86_32, sha1, @@ -20,11 +19,13 @@ from cudf._lib.pylibcudf.libcudf.hash cimport ( sha512, xxhash_64, ) -from cudf._lib.pylibcudf.libcudf.partitioning cimport ( +from pylibcudf.libcudf.partitioning cimport ( hash_partition as cpp_hash_partition, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from 
pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + +from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 37595b65e65..1dc586bb257 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,15 +4,16 @@ from cpython cimport pycapsule from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib import pylibcudf +import pylibcudf -from cudf._lib.pylibcudf.libcudf.interop cimport ( +from pylibcudf.libcudf.interop cimport ( DLManagedTensor, from_dlpack as cpp_from_dlpack, to_dlpack as cpp_to_dlpack, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + from cudf._lib.utils cimport ( columns_from_pylibcudf_table, columns_from_unique_ptr, diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 680a87c789e..1938f00c179 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -3,14 +3,15 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.libcudf.io.data_sink cimport data_sink +from pylibcudf.libcudf.io.types cimport ( column_name_info, sink_info, source_info, ) +from cudf._lib.column cimport Column + cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 58956b9e9b7..b1900138d94 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -7,17 +7,18 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.io.datasource cimport Datasource -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.io.datasource cimport Datasource +from pylibcudf.libcudf.io.data_sink cimport data_sink +from pylibcudf.libcudf.io.datasource cimport datasource +from pylibcudf.libcudf.io.types cimport ( column_name_info, host_buffer, sink_info, source_info, ) +from cudf._lib.column cimport Column + import codecs import errno import io diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 0a54f0d67a0..2559358c21f 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib import pylibcudf +import pylibcudf # The functions below return the *gathermaps* that represent # the join result when joining on the keys `lhs` and `rhs`. 
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 03bf9ed8b75..9bbbcf60dcf 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -9,18 +9,19 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types +cimport pylibcudf.libcudf.io.types as cudf_io_types +from pylibcudf.io.types cimport compression_type +from pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from pylibcudf.libcudf.io.types cimport compression_type +from pylibcudf.libcudf.types cimport data_type, type_id +from pylibcudf.types cimport DataType + from cudf._lib.column cimport Column from cudf._lib.io.utils cimport add_df_col_struct_names -from cudf._lib.pylibcudf.io.types cimport compression_type -from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t -from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type -from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id -from cudf._lib.pylibcudf.types cimport DataType from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io -import cudf._lib.pylibcudf as plc +import pylibcudf as plc cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx index 439a727a9ca..2e1959a348d 100644 --- a/python/cudf/cudf/_lib/labeling.pyx +++ b/python/cudf/cudf/_lib/labeling.pyx @@ -6,13 +6,11 @@ from libcpp cimport bool as cbool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.labeling cimport inclusive, label_bins as cpp_label_bins + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.labeling cimport ( - inclusive, - label_bins as cpp_label_bins, -) # Note that the parameter input shadows a Python built-in in the local scope, diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index f6d9c8c404c..7e8710bedb6 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,13 +4,14 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool +from pylibcudf.libcudf.types cimport null_order, size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.utils cimport columns_from_pylibcudf_table -from cudf._lib import pylibcudf +import pylibcudf -from cudf._lib.pylibcudf cimport Scalar +from pylibcudf cimport Scalar @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index fe7f7ad2918..9372acdab44 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -4,7 +4,7 @@ from libcpp cimport bool from cudf._lib.utils cimport columns_from_pylibcudf_table -from cudf._lib import pylibcudf +import pylibcudf def merge_sorted( diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index b00deae2270..3a7b6a59bf3 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -10,9 +10,8 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport 
move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.null_mask cimport ( +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, bitmask_and as cpp_bitmask_and, bitmask_or as cpp_bitmask_or, @@ -20,8 +19,10 @@ from cudf._lib.pylibcudf.libcudf.null_mask cimport ( create_null_mask as cpp_create_null_mask, underlying_type_t_mask_state, ) -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport mask_state, size_type + +from cudf._lib.column cimport Column from cudf._lib.utils cimport table_view_from_columns diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx index d60162d0656..0d768e24f39 100644 --- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -6,15 +6,16 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.byte_pair_encode cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.byte_pair_encode cimport ( bpe_merge_pairs as cpp_bpe_merge_pairs, byte_pair_encoding as cpp_byte_pair_encoding, load_merge_pairs as cpp_load_merge_pairs, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar cimport string_scalar + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index 514b6610575..e3c2273345a 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -5,14 +5,15 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.edit_distance cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( edit_distance as cpp_edit_distance, edit_distance_matrix as cpp_edit_distance_matrix, ) +from cudf._lib.column cimport Column + @acquire_spill_lock() def edit_distance(Column strings, Column targets): diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index a6b9a1e4f7a..6591b527eec 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -5,16 +5,17 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column 
cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.generate_ngrams cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( generate_character_ngrams as cpp_generate_character_ngrams, generate_ngrams as cpp_generate_ngrams, hash_character_ngrams as cpp_hash_character_ngrams, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx index 42fe15d6869..0ebf7c281e3 100644 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx @@ -5,13 +5,14 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.jaccard cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.jaccard cimport ( jaccard_index as cpp_jaccard_index, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 4c92999e190..5ee15d0e409 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -5,14 +5,15 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.minhash cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index ccd8de8c96f..dec4f037d98 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -5,14 +5,15 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from 
pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( ngrams_tokenize as cpp_ngrams_tokenize, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 9f81f865bb7..5e86a9ce959 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -6,14 +6,15 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.normalize cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.normalize cimport ( normalize_characters as cpp_normalize_characters, normalize_spaces as cpp_normalize_spaces, ) +from cudf._lib.column cimport Column + @acquire_spill_lock() def normalize_spaces(Column strings): diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx index ce2edc58d19..61ae3da5782 100644 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ b/python/cudf/cudf/_lib/nvtext/replace.pyx @@ -5,15 +5,16 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.replace cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.replace cimport ( filter_tokens as cpp_filter_tokens, replace_tokens as cpp_replace_tokens, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx index 8f75953ae99..5bf25562fed 100644 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx @@ -7,16 +7,17 @@ from libcpp.utility cimport move from enum import IntEnum -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.stemmer cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.stemmer cimport ( is_letter as cpp_is_letter, letter_type, porter_stemmer_measure as cpp_porter_stemmer_measure, underlying_type_t_letter_type, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column class LetterType(IntEnum): diff --git 
a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx index 1112667a087..ee442ece5c6 100644 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx @@ -9,9 +9,8 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.subword_tokenize cimport ( +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( hashed_vocabulary as cpp_hashed_vocabulary, load_vocabulary_file as cpp_load_vocabulary_file, move as tr_move, @@ -19,6 +18,8 @@ from cudf._lib.pylibcudf.libcudf.nvtext.subword_tokenize cimport ( tokenizer_result as cpp_tokenizer_result, ) +from cudf._lib.column cimport Column + cdef class Hashed_Vocabulary: cdef unique_ptr[cpp_hashed_vocabulary] c_obj diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx index 98afd94ab1c..a7e63f1e9ae 100644 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx @@ -5,10 +5,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.nvtext.tokenize cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.tokenize cimport ( character_tokenize as cpp_character_tokenize, count_tokens as cpp_count_tokens, detokenize as cpp_detokenize, @@ -17,8 +16,10 @@ from cudf._lib.pylibcudf.libcudf.nvtext.tokenize cimport ( tokenize_vocabulary as cpp_tokenize_vocabulary, tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 9609e3131b4..d506dcd4346 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -14,23 +14,17 @@ from libcpp.vector cimport vector import datetime from collections import OrderedDict -cimport cudf._lib.pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view +cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view try: import ujson as json except ImportError: import json -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - make_sink_info, - make_source_info, - update_column_struct_field_names, -) -from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.orc cimport ( +cimport pylibcudf.libcudf.io.types as cudf_io_types +from pylibcudf.io.datasource cimport NativeFileDatasource +from pylibcudf.libcudf.io.data_sink cimport data_sink +from pylibcudf.libcudf.io.orc cimport ( 
chunked_orc_writer_options, orc_chunked_writer, orc_reader_options, @@ -38,7 +32,7 @@ from cudf._lib.pylibcudf.libcudf.io.orc cimport ( read_orc as libcudf_read_orc, write_orc as libcudf_write_orc, ) -from cudf._lib.pylibcudf.libcudf.io.orc_metadata cimport ( +from pylibcudf.libcudf.io.orc_metadata cimport ( binary_statistics, bucket_statistics, column_statistics, @@ -53,7 +47,7 @@ from cudf._lib.pylibcudf.libcudf.io.orc_metadata cimport ( string_statistics, timestamp_statistics, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.libcudf.io.types cimport ( column_in_metadata, compression_type, sink_info, @@ -61,9 +55,16 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport ( table_input_metadata, table_with_metadata, ) -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id -from cudf._lib.variant cimport get_if as std_get_if, holds_alternative +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport data_type, size_type, type_id +from pylibcudf.variant cimport get_if as std_get_if, holds_alternative + +from cudf._lib.column cimport Column +from cudf._lib.io.utils cimport ( + make_sink_info, + make_source_info, + update_column_struct_field_names, +) from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 0fffb6ade58..4bfb79ff651 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -31,40 +31,43 @@ from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector -cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - add_df_col_struct_names, - make_sinks_info, - make_source_info, -) -from cudf._lib.pylibcudf.expressions cimport Expression -from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource -from cudf._lib.pylibcudf.io.parquet cimport ChunkedParquetReader -from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( +cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink +cimport pylibcudf.libcudf.io.types as cudf_io_types +from pylibcudf.expressions cimport Expression +from pylibcudf.io.datasource cimport NativeFileDatasource +from pylibcudf.io.parquet cimport ChunkedParquetReader +from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_writer_options, merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_writer_options, write_parquet as parquet_writer, ) -from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport ( +from pylibcudf.libcudf.io.parquet_metadata cimport ( parquet_metadata, read_parquet_metadata as parquet_metadata_reader, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.libcudf.io.types cimport ( column_in_metadata, table_input_metadata, ) -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column +from cudf._lib.io.utils cimport ( + add_df_col_struct_names, + make_sinks_info, + make_source_info, +) from cudf._lib.utils cimport table_view_from_table from pyarrow.lib import NativeFile 
-import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf cimport Table +import pylibcudf as plc + +from pylibcudf cimport Table + from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index 708ec4174aa..d94f0e1b564 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -7,19 +7,18 @@ from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.partitioning cimport partition as cpp_partition +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.partitioning cimport ( - partition as cpp_partition, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.reduce import minmax from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types +cimport pylibcudf.libcudf.types as libcudf_types @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd deleted file mode 100644 index 3695f36a6e7..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata -from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options -from cudf._lib.pylibcudf.libcudf.types cimport size_type - - -cpdef TableWithMetadata read_avro( - SourceInfo source_info, - list columns = *, - size_type skip_rows = *, - size_type num_rows = * -) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd deleted file mode 100644 index 57903ca27de..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program -from cudf._lib.pylibcudf.libcudf.table.table cimport table - - -cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: - - cdef unique_ptr[table] extract( - column_view source_strings, - regex_program) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd deleted file mode 100644 index a80e02f520c..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -from cudf._lib.pylibcudf.libcudf.strings.char_types cimport ( - string_character_types, -) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd deleted file mode 100644 index 275aa95d97e..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram - - -cpdef Column contains_re(Column input, RegexProgram prog) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd deleted file mode 100644 index 79937bf574a..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 7b50c00919a..7666b7ff8da 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -13,10 +13,11 @@ from cudf._lib.types cimport ( from cudf._lib.types import Interpolation -from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted +from pylibcudf.libcudf.types cimport interpolation, sorted + from cudf._lib.utils cimport columns_from_pylibcudf_table -import cudf._lib.pylibcudf as plc +import pylibcudf as plc @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 511bba20ef5..944753d28b8 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -8,7 +8,8 @@ from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id -from cudf._lib import pylibcudf +import pylibcudf + from cudf._lib.aggregation import make_aggregation diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 2b5f32c7675..b50c6dd25e3 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -6,7 +6,8 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -from cudf._lib import pylibcudf +import pylibcudf + from cudf._lib.scalar import as_device_scalar diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index 6bba8f0df35..6cebeb2bc16 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -2,11 +2,12 @@ from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport columns_from_pylibcudf_table -import cudf._lib.pylibcudf as plc +import pylibcudf as plc @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx index 5439e70fdce..687b261c2c7 100644 --- a/python/cudf/cudf/_lib/rolling.pyx +++ b/python/cudf/cudf/_lib/rolling.pyx @@ -4,7 +4,8 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib import pylibcudf +import pylibcudf + from cudf._lib.aggregation import make_aggregation diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx index f8ad57947c8..f961c09e6f6 100644 --- 
a/python/cudf/cudf/_lib/round.pyx +++ b/python/cudf/cudf/_lib/round.pyx @@ -4,8 +4,8 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.round import RoundingMethod +import pylibcudf as plc +from pylibcudf.round import RoundingMethod @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index b57acbb37f1..27095ca02d4 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -3,10 +3,9 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.scalar.scalar cimport scalar from rmm._lib.memory_resource cimport DeviceMemoryResource -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar - cdef class DeviceScalar: cdef public object c_value diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index e68398498d1..0dde91316fb 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -11,38 +11,40 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +import pylibcudf + import cudf -from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types +cimport pylibcudf.libcudf.types as libcudf_types # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from cudf._lib.pylibcudf cimport Scalar as plc_Scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport ( duration_scalar, list_scalar, scalar, struct_scalar, timestamp_scalar, ) -from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport ( +from pylibcudf.libcudf.wrappers.durations cimport ( duration_ms, duration_ns, duration_s, duration_us, ) -from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( +from pylibcudf.libcudf.wrappers.timestamps cimport ( timestamp_ms, timestamp_ns, timestamp_s, timestamp_us, ) + from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx index 1ee73949fd3..8108361052b 100644 --- a/python/cudf/cudf/_lib/search.pyx +++ b/python/cudf/cudf/_lib/search.pyx @@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib import pylibcudf +import pylibcudf @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index ff9565b9a89..185552ede82 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -9,18 +9,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector +from pylibcudf.libcudf.aggregation cimport rank_method +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.search cimport lower_bound, upper_bound +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport null_order, order as cpp_order + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.search cimport lower_bound, upper_bound -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order as cpp_order from cudf._lib.utils cimport ( columns_from_pylibcudf_table, table_view_from_columns, ) -from cudf._lib import pylibcudf +import pylibcudf @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 834f91f48d9..1b8831940e3 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -7,7 +7,7 @@ from libcpp cimport bool from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -from cudf._lib import pylibcudf +import pylibcudf @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index dfad7fd101c..8d463829a19 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -12,39 +12,40 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_booleans cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from 
pylibcudf.libcudf.strings.convert.convert_booleans cimport ( from_booleans as cpp_from_booleans, to_booleans as cpp_to_booleans, ) -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_datetime cimport ( +from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( from_timestamps as cpp_from_timestamps, is_timestamp as cpp_is_timestamp, to_timestamps as cpp_to_timestamps, ) -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_durations cimport ( +from pylibcudf.libcudf.strings.convert.convert_durations cimport ( from_durations as cpp_from_durations, to_durations as cpp_to_durations, ) -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_floats cimport ( +from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, ) -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_integers cimport ( +from pylibcudf.libcudf.strings.convert.convert_integers cimport ( from_integers as cpp_from_integers, hex_to_integers as cpp_hex_to_integers, integers_to_hex as cpp_integers_to_hex, is_hex as cpp_is_hex, to_integers as cpp_to_integers, ) -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( +from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( integers_to_ipv4 as cpp_integers_to_ipv4, ipv4_to_integers as cpp_ipv4_to_integers, is_ipv4 as cpp_is_ipv4, ) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from pylibcudf.libcudf.types cimport data_type, type_id + from cudf._lib.types cimport underlying_type_t_type_id import cudf diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx index 1f3d7c4eb1b..fe8c17c9e31 100644 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ b/python/cudf/cudf/_lib/strings/attributes.pyx @@ -5,15 +5,16 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.attributes cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.attributes cimport ( code_points as cpp_code_points, count_bytes as cpp_count_bytes, count_characters as cpp_count_characters, ) +from cudf._lib.column cimport Column + @acquire_spill_lock() def count_characters(Column source_strings): diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx index b3ca6a5ac8f..42c40e2e753 100644 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/strings/capitalize.pyx @@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -import cudf._lib.pylibcudf as plc +import pylibcudf as plc @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx index 38f242a67d6..ad4cbb6f088 100644 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ b/python/cudf/cudf/_lib/strings/case.pyx @@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.strings import case +from pylibcudf.strings import case @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 
5b7b6d19d9e..376a6f8af97 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -7,15 +7,16 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.char_types cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.char_types cimport ( all_characters_of_type as cpp_all_characters_of_type, filter_characters_of_type as cpp_filter_characters_of_type, string_character_types, ) + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 288f333d4d8..76cc13db0da 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -5,18 +5,19 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.combine cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.combine cimport ( concatenate as cpp_concatenate, join_list_elements as cpp_join_list_elements, join_strings as cpp_join_strings, output_if_empty_list, separator_on_nulls, ) -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.table.table_view cimport table_view + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport table_view_from_columns diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 502a1d14696..82f5e06c547 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -9,21 +9,22 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.contains cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.contains cimport ( count_re as cpp_count_re, like as cpp_like, matches_re as cpp_matches_re, ) -from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from pylibcudf.libcudf.strings.regex_program cimport 
regex_program + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.pylibcudf.strings import contains -from cudf._lib.pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.strings import contains +from pylibcudf.strings.regex_program import RegexProgram @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 6faff606226..a8df8c9a92c 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -7,15 +7,16 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( from_fixed_point as cpp_from_fixed_point, is_fixed_point as cpp_is_fixed_point, to_fixed_point as cpp_to_fixed_point, ) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from pylibcudf.libcudf.types cimport data_type, type_id + +from cudf._lib.column cimport Column @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 341cbc99dab..7965b588703 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -5,13 +5,14 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_floats cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.convert.convert_floats cimport ( is_float as cpp_is_float, ) +from cudf._lib.column cimport Column + @acquire_spill_lock() def is_float(Column source_strings): diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx index 081b03cdc0d..8b6da2bfa1c 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -5,13 +5,14 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_integers cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.convert.convert_integers cimport ( is_integer as cpp_is_integer, ) +from cudf._lib.column cimport Column + @acquire_spill_lock() def is_integer(Column source_strings): diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx index 
4418bf2a72d..73aebf8ab35 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -5,14 +5,15 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_lists cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert.convert_lists cimport ( format_list_column as cpp_format_list_column, ) +from cudf._lib.column cimport Column + from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx index 5f62efe5c00..e52116d6247 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx @@ -5,14 +5,15 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.convert.convert_urls cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.convert.convert_urls cimport ( url_decode as cpp_url_decode, url_encode as cpp_url_encode, ) +from cudf._lib.column cimport Column + @acquire_spill_lock() def url_decode(Column source_strings): diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 3b80c4f6368..63f4d57e562 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -8,12 +8,13 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from pylibcudf.libcudf.strings.regex_program cimport regex_program +from pylibcudf.libcudf.table.table cimport table + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program -from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.utils cimport data_from_unique_ptr diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx index 3c0009ee569..2d284d1aa9d 100644 --- a/python/cudf/cudf/_lib/strings/find.pyx +++ b/python/cudf/cudf/_lib/strings/find.pyx @@ -1,10 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-import cudf._lib.pylibcudf as plc +import pylibcudf as plc + from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.types cimport size_type @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index c75f28db21b..1358f8e3c2c 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -5,13 +5,14 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.find_multiple cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.find_multiple cimport ( find_multiple as cpp_find_multiple, ) +from cudf._lib.column cimport Column + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 0d409889bc8..3cf2084e30a 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -8,12 +8,13 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from pylibcudf.libcudf.strings.regex_program cimport regex_program + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.findall cimport findall as cpp_findall -from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx index 560f284b56c..c9b0bba088d 100644 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ b/python/cudf/cudf/_lib/strings/json.pyx @@ -5,14 +5,15 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.json cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.json cimport ( get_json_object as cpp_get_json_object, get_json_object_options, ) + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx index 9226810951f..d0239e91ec3 100644 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ b/python/cudf/cudf/_lib/strings/padding.pyx @@ -6,18 +6,19 @@ from 
libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type from enum import IntEnum -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.strings.padding cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.padding cimport ( pad as cpp_pad, zfill as cpp_zfill, ) -from cudf._lib.pylibcudf.libcudf.strings.side_type cimport ( +from pylibcudf.libcudf.strings.side_type cimport ( side_type, underlying_type_t_side_type, ) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx index 2b8116848cf..42fcfa5d94e 100644 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -5,11 +5,12 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings cimport repeat as cpp_repeat +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings cimport repeat as cpp_repeat -from cudf._lib.pylibcudf.libcudf.types cimport size_type @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 374831f1833..a260c4e4f45 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -4,11 +4,12 @@ from libc.stdint cimport int32_t from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar cimport DeviceScalar -import cudf._lib.pylibcudf as plc +import pylibcudf as plc @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx index e13880a6186..fffc8b7c3f6 100644 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ b/python/cudf/cudf/_lib/strings/replace_re.pyx @@ -8,17 +8,18 @@ from libcpp.vector cimport vector from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program -from cudf._lib.pylibcudf.libcudf.strings.replace_re cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from pylibcudf.libcudf.strings.regex_program cimport regex_program +from pylibcudf.libcudf.strings.replace_re cimport ( replace_re as cpp_replace_re, replace_with_backrefs as cpp_replace_with_backrefs, ) -from 
cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index be377c0f86b..a81fb18e752 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -5,14 +5,15 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.split.partition cimport ( +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.split.partition cimport ( partition as cpp_partition, rpartition as cpp_rpartition, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table cimport table + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index 942235686d7..f481fea4c51 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -7,13 +7,12 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program -from cudf._lib.pylibcudf.libcudf.strings.split.split cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from pylibcudf.libcudf.strings.regex_program cimport regex_program +from pylibcudf.libcudf.strings.split.split cimport ( rsplit as cpp_rsplit, rsplit_re as cpp_rsplit_re, rsplit_record as cpp_rsplit_record, @@ -23,8 +22,10 @@ from cudf._lib.pylibcudf.libcudf.strings.split.split cimport ( split_record as cpp_split_record, split_record_re as cpp_split_record_re, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 199fa5fc3b6..acf52cb7b9f 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -5,12 +5,13 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from 
pylibcudf.libcudf.strings.side_type cimport side_type +from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type -from cudf._lib.pylibcudf.libcudf.strings.strip cimport strip as cpp_strip from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx index 706c21c0634..db96d99c7b6 100644 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ b/python/cudf/cudf/_lib/strings/substring.pyx @@ -10,7 +10,7 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -import cudf._lib.pylibcudf as plc +import pylibcudf as plc @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx index 8846e2e280d..3fad91bbfc0 100644 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ b/python/cudf/cudf/_lib/strings/translate.pyx @@ -8,16 +8,17 @@ from libcpp.vector cimport vector from cudf.core.buffer import acquire_spill_lock -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.translate cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.translate cimport ( filter_characters as cpp_filter_characters, filter_type, translate as cpp_translate, ) -from cudf._lib.pylibcudf.libcudf.types cimport char_utf8 +from pylibcudf.libcudf.types cimport char_utf8 + +from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx index 92750f21e4d..eed5cf33b10 100644 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ b/python/cudf/cudf/_lib/strings/wrap.pyx @@ -5,11 +5,12 @@ from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap -from cudf._lib.pylibcudf.libcudf.types cimport size_type @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 7610cad0b40..78fc9f08bd8 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -2,7 +2,7 @@ from libc.stdint cimport uint8_t, uint16_t, uintptr_t -from cudf._lib.pylibcudf.libcudf.strings_udf cimport ( +from pylibcudf.libcudf.strings_udf cimport ( get_character_cases_table as cpp_get_character_cases_table, get_character_flags_table as cpp_get_character_flags_table, get_special_case_mapping_table as 
cpp_get_special_case_mapping_table, @@ -15,17 +15,17 @@ from libcpp.utility cimport move from cudf.core.buffer import as_buffer -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer - -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view -from cudf._lib.pylibcudf.libcudf.strings_udf cimport ( +from pylibcudf.libcudf.column.column cimport column, column_view +from pylibcudf.libcudf.strings_udf cimport ( column_from_udf_string_array as cpp_column_from_udf_string_array, free_udf_string_array as cpp_free_udf_string_array, get_cuda_build_version as cpp_get_cuda_build_version, to_string_view_array as cpp_to_string_view_array, udf_string, ) +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from cudf._lib.column cimport Column def get_cuda_build_version(): diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index 6e63b8758b8..ece69b424bb 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -8,9 +8,8 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.io.text cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.io.text cimport ( byte_range_info, data_chunk_source, make_source, @@ -20,6 +19,8 @@ from cudf._lib.pylibcudf.libcudf.io.text cimport ( parse_options, ) +from cudf._lib.column cimport Column + def read_text(object filepaths_or_buffers, object delimiter=None, diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 53977e984c2..bff3b2c4ce4 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -5,10 +5,11 @@ from libcpp.optional cimport make_optional from libcpp.string cimport string from libcpp.utility cimport move -from cudf._lib.pylibcudf.libcudf.io.timezone cimport ( +from pylibcudf.libcudf.io.timezone cimport ( make_timezone_transition_table as cpp_make_timezone_transition_table, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table cimport table + from cudf._lib.utils cimport columns_from_unique_ptr diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 622725e06a3..baa08a545ec 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -15,23 +15,23 @@ from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.utility cimport move -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer - -cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform -from cudf._lib.column cimport Column -from cudf._lib.pylibcudf cimport transform as plc_transform -from cudf._lib.pylibcudf.expressions cimport Expression -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.expressions cimport expression -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( +cimport pylibcudf.libcudf.transform as libcudf_transform +from pylibcudf cimport transform as plc_transform +from pylibcudf.expressions cimport Expression +from pylibcudf.libcudf.column.column cimport column +from 
pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.expressions cimport expression +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport ( bitmask_type, data_type, size_type, type_id, ) +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from cudf._lib.column cimport Column from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( columns_from_unique_ptr, diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 82b23439e6a..f78fbd4c844 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -4,10 +4,11 @@ from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.transpose cimport transpose as cpp_transpose + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.transpose cimport transpose as cpp_transpose from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 519d5ff8554..4fd3d31841e 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -3,11 +3,9 @@ from libc.stdint cimport int32_t from libcpp cimport bool -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) +cimport pylibcudf.libcudf.types as libcudf_types +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view ctypedef bool underlying_type_t_order ctypedef bool underlying_type_t_null_order diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 253fdf7b0d9..861bb063707 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -7,19 +7,19 @@ import pandas as pd from libcpp.memory cimport make_shared, shared_ptr -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) +cimport pylibcudf.libcudf.types as libcudf_types +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view + from cudf._lib.types cimport ( underlying_type_t_interpolation, underlying_type_t_order, underlying_type_t_sorted, ) +import pylibcudf + import cudf -from cudf._lib import pylibcudf class TypeId(IntEnum): diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx index 2f58c4512d6..d5602fd5a1c 100644 --- a/python/cudf/cudf/_lib/unary.pyx +++ b/python/cudf/cudf/_lib/unary.pyx @@ -5,7 +5,8 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type import numpy as np -from cudf._lib import pylibcudf +import pylibcudf + from cudf.api.types import is_decimal_dtype from cudf.core.buffer import acquire_spill_lock diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 
1d55f7218dc..ff97fe80310 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -4,8 +4,8 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from cudf._lib.pylibcudf.libcudf.column.column cimport column_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view +from pylibcudf.libcudf.column.column cimport column_view +from pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 267432a0182..cae28d02ef4 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -10,11 +10,12 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector +from pylibcudf.libcudf.column.column cimport column, column_view +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type + from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type try: import ujson as json diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 63714a78572..67bde5a72b2 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -6,8 +6,8 @@ import pyarrow as pa -import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.expressions import ( +import pylibcudf as plc +from pylibcudf.expressions import ( ASTOperator, ColumnReference, Expression, diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 80dbbe4c048..32ae8c5ee53 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -11,6 +11,7 @@ import numpy from typing_extensions import Self +import pylibcudf import rmm import cudf @@ -501,7 +502,7 @@ def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]: shape = array_interface["shape"] or (1,) strides = array_interface["strides"] itemsize = cudf.dtype(array_interface["typestr"]).itemsize - if strides is None or cudf._lib.pylibcudf.column.is_c_contiguous( + if strides is None or pylibcudf.column.is_c_contiguous( shape, strides, itemsize ): nelem = math.prod(shape) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index ac36813202a..a37355dfcda 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -9,9 +9,10 @@ import pandas as pd from typing_extensions import Self +import pylibcudf + import cudf from cudf import _lib as libcudf -from cudf._lib import pylibcudf from cudf.api.types import is_integer, is_scalar from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.dtypes import CategoricalDtype diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8eb6de79bce..2263dfd5c98 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -24,6 +24,8 @@ import pandas as pd from typing_extensions import Self +import pylibcudf + import cudf import cudf._lib as libcudf import cudf.core @@ -6311,7 +6313,7 
@@ def rank( if method not in {"average", "min", "max", "first", "dense"}: raise KeyError(method) - method_enum = libcudf.pylibcudf.aggregation.RankMethod[method.upper()] + method_enum = pylibcudf.aggregation.RankMethod[method.upper()] if na_option not in {"keep", "top", "bottom"}: raise ValueError( "na_option must be one of 'keep', 'top', or 'bottom'" diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index e88e795671e..bacf1f7e77b 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -5,10 +5,9 @@ import os import warnings +import pylibcudf import rmm.mr -from cudf._lib import pylibcudf - from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 60ac171f3d7..9db52164eca 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "pandas>=2.0,<2.2.3dev0", "ptxcompiler", "pyarrow>=16.1.0,<16.2.0a0", + "pylibcudf==24.10.*,>=0.0.0a0", "rich", "rmm==24.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", @@ -88,6 +89,7 @@ known_dask = [ ] known_rapids = [ "rmm", + "pylibcudf" ] known_first_party = [ "cudf", @@ -127,6 +129,7 @@ requires = [ "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", + "pylibcudf==24.10.*,>=0.0.0a0", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt index 4f3b9220a4f..1b205537d73 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -20,5 +20,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ) -include(../../../cudf/cmake/Modules/LinkPyarrowHeaders.cmake) +include(../../../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index 2de0bf39785..e65b0d233b9 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -6,9 +6,8 @@ from libcpp.map cimport map from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.io.datasource cimport Datasource -from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +from pylibcudf.io.datasource cimport Datasource +from pylibcudf.libcudf.io.datasource cimport datasource cdef extern from "cudf_kafka/kafka_callback.hpp" \ diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 2927dc0aa9a..20aa43b0134 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -6,8 +6,7 @@ from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +from pylibcudf.libcudf.io.datasource cimport datasource from cudf_kafka._lib.kafka cimport kafka_consumer diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 02018548b2c..dd3b771e305 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -8,7 +8,7 @@ import functools from typing import TYPE_CHECKING -import cudf._lib.pylibcudf as plc +import pylibcudf as plc if TYPE_CHECKING: from typing_extensions import Self diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index dba76855329..7c28e7b9a6c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -10,11 +10,10 @@ from typing import TYPE_CHECKING, cast import pyarrow as pa +import pylibcudf as plc import polars as pl -import cudf._lib.pylibcudf as plc - from cudf_polars.containers.column import NamedColumn from cudf_polars.utils import dtypes diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 9e0fca3f52f..e1b4d30b76b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -21,11 +21,10 @@ from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa +import pylibcudf as plc from polars.polars import _expr_nodes as pl_expr -import cudf._lib.pylibcudf as plc - from cudf_polars.containers import Column, NamedColumn from cudf_polars.utils import dtypes, sorting diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 3754addeb11..019f00f4fca 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -21,12 +21,11 @@ from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa +import pylibcudf as plc from typing_extensions import assert_never import 
polars as pl -import cudf._lib.pylibcudf as plc - import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn from cudf_polars.utils import sorting diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index dec45679c75..6dc97c7cb51 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -11,14 +11,13 @@ from typing import Any import pyarrow as pa +import pylibcudf as plc from typing_extensions import assert_never import polars as pl import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir -import cudf._lib.pylibcudf as plc - from cudf_polars.dsl import expr, ir from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index c04eac41bb7..02440e67fde 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -8,9 +8,9 @@ from collections.abc import Mapping from typing import TYPE_CHECKING, Literal, Protocol, Union -from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +import pylibcudf as plc -import cudf._lib.pylibcudf as plc +from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir if TYPE_CHECKING: from typing import Callable diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index cd68d021286..7f6ea1edfd9 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -8,12 +8,11 @@ from functools import cache import pyarrow as pa +import pylibcudf as plc from typing_extensions import assert_never import polars as pl -import cudf._lib.pylibcudf as plc - __all__ = ["from_polars", "downcast_arrow_lists"] diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index 57f94c4ec4c..17ea44e5b1b 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING -import cudf._lib.pylibcudf as plc +import pylibcudf as plc if TYPE_CHECKING: from collections.abc import Sequence diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 424c83a5199..c380853035d 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,8 +19,8 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.10.*,>=0.0.0a0", "polars>=1.0,<1.3", + "pylibcudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 4f3c0de5975..19919877f84 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -6,10 +6,9 @@ from functools import partial import pyarrow +import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - from cudf_polars.containers import Column, NamedColumn diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 87508e17407..6b470268084 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -3,12 +3,11 @@ from __future__ import annotations +import pylibcudf as plc import pytest import polars as pl -import cudf._lib.pylibcudf as plc - from cudf_polars.containers import DataFrame, NamedColumn diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py index ddc3ca66d86..b7d4672daca 100644 --- a/python/cudf_polars/tests/dsl/test_expr.py +++ b/python/cudf_polars/tests/dsl/test_expr.py @@ -3,10 +3,9 @@ from __future__ import annotations +import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - from cudf_polars.dsl import expr diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 5bd3131d1d7..ced49bdc254 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -2,12 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pylibcudf as plc import pytest import polars as pl -import cudf._lib.pylibcudf as plc - from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index d46df92db94..76c7648813a 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -4,12 +4,11 @@ import itertools +import pylibcudf as plc import pytest import polars as pl -import cudf._lib.pylibcudf as plc - from cudf_polars import translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index 69ad1e519e2..35aaef44e1f 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -3,10 +3,9 @@ from __future__ import annotations +import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - from cudf_polars.containers import NamedColumn from cudf_polars.dsl.ir import broadcast diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt new file mode 100644 index 00000000000..424d8372280 --- /dev/null +++ b/python/pylibcudf/CMakeLists.txt @@ -0,0 +1,100 @@ +# ============================================================================= +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+include(rapids-cuda)
+rapids_cuda_init_architectures(pylibcudf)
+
+project(
+  pylibcudf
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX CUDA
+)
+
+option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
+       OFF
+)
+option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
+mark_as_advanced(USE_LIBARROW_FROM_PYARROW)
+
+# Find Python early so that later commands can use it
+find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+
+# If the user requested it, we attempt to find CUDF.
+if(FIND_CUDF_CPP)
+  include(rapids-cpm)
+  include(rapids-export)
+  include(rapids-find)
+  rapids_cpm_init()
+
+  if(USE_LIBARROW_FROM_PYARROW)
+    # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow
+    # libraries. These variables have no effect because we are always searching for arrow via
+    # pyarrow, but they must be set as they are required arguments to the function in
+    # get_arrow.cmake.
+    set(CUDF_USE_ARROW_STATIC OFF)
+    set(CUDF_ENABLE_ARROW_S3 OFF)
+    set(CUDF_ENABLE_ARROW_ORC OFF)
+    set(CUDF_ENABLE_ARROW_PYTHON OFF)
+    set(CUDF_ENABLE_ARROW_PARQUET OFF)
+    include(../../cpp/cmake/thirdparty/get_arrow.cmake)
+  endif()
+
+  find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
+
+  # An installed version of libcudf doesn't provide the dlpack headers, so we need to download
+  # dlpack for interop.pyx.
+  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
+else()
+  set(cudf_FOUND OFF)
+endif()
+
+include(rapids-cython-core)
+
+if(NOT cudf_FOUND)
+  set(BUILD_TESTS OFF)
+  set(BUILD_BENCHMARKS OFF)
+  set(CUDF_BUILD_TESTUTIL OFF)
+  set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
+  set(CUDA_STATIC_RUNTIME ON)
+
+  add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL)
+
+  # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous
+  # components like headers from libcudacxx, but we do need the libraries. However, we want to
+  # control where they are installed to. Since there are multiple subpackages of pylibcudf that
+  # require access to libcudf, we place the library and all its dependent artifacts in the cudf
+  # directory as a single source of truth and modify the other rpaths appropriately.
+  set(cython_lib_dir pylibcudf)
+  include(cmake/Modules/WheelHelpers.cmake)
+  # TODO: This install is currently overzealous. We should only install the libraries that are
+  # downloaded by CPM during the build, not libraries that were found on the system. However, in
+  # practice right now this would only be a problem if libcudf was not found but some of the
+  # dependencies were, and we have no real use cases where that happens.
+ install_aliased_imported_targets( + TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp + DESTINATION ${cython_lib_dir} + ) +endif() + +rapids_cython_init() + +include(cmake/Modules/LinkPyarrowHeaders.cmake) +add_subdirectory(pylibcudf) + +if(DEFINED cython_lib_dir) + rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") +endif() diff --git a/python/pylibcudf/README.md b/python/pylibcudf/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/pylibcudf/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake b/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake similarity index 100% rename from python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake rename to python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake diff --git a/python/cudf/cmake/Modules/WheelHelpers.cmake b/python/pylibcudf/cmake/Modules/WheelHelpers.cmake similarity index 100% rename from python/cudf/cmake/Modules/WheelHelpers.cmake rename to python/pylibcudf/cmake/Modules/WheelHelpers.cmake diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt rename to python/pylibcudf/pylibcudf/CMakeLists.txt index da32d530928..ab21bfe97ab 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -54,7 +54,7 @@ rapids_cython_create_modules( ) include(${rapids-cmake-dir}/export/find_package_root.cmake) -include(../../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) +include(../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) diff --git a/python/pylibcudf/pylibcudf/VERSION b/python/pylibcudf/pylibcudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/pylibcudf/pylibcudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/__init__.pxd rename to python/pylibcudf/pylibcudf/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py similarity index 99% rename from python/cudf/cudf/_lib/pylibcudf/__init__.py rename to python/pylibcudf/pylibcudf/__init__.py index 9705eba84b1..677fdaf80d0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -12,6 +12,7 @@ filling, groupby, interop, + io, join, lists, merge, diff --git a/python/pylibcudf/pylibcudf/_version.py b/python/pylibcudf/pylibcudf/_version.py new file mode 100644 index 00000000000..d2765e5d53c --- /dev/null +++ b/python/pylibcudf/pylibcudf/_version.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import importlib.resources + +__version__ = ( + importlib.resources.files(__package__) + .joinpath("VERSION") + .read_text() + .strip() +) +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/pylibcudf/pylibcudf/aggregation.pxd similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/aggregation.pxd rename to python/pylibcudf/pylibcudf/aggregation.pxd index 0981d0e855a..c9ab1eab21c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/pylibcudf/pylibcudf/aggregation.pxd @@ -1,8 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.aggregation cimport ( +from pylibcudf.libcudf.aggregation cimport ( Kind as kind_t, aggregation, correlation_type, @@ -15,7 +14,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport ( rolling_aggregation, scan_aggregation, ) -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.types cimport ( interpolation, nan_equality, null_equality, diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/aggregation.pyx rename to python/pylibcudf/pylibcudf/aggregation.pyx index eed2f6de585..e510b738f70 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/pylibcudf/pylibcudf/aggregation.pyx @@ -4,8 +4,7 @@ from cython.operator cimport dereference from libcpp.cast cimport dynamic_cast from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf.aggregation cimport ( +from pylibcudf.libcudf.aggregation cimport ( aggregation, correlation_type, ewm_history, @@ -41,7 +40,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport ( rolling_aggregation, scan_aggregation, ) -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.types cimport ( interpolation, nan_equality, null_equality, @@ -51,18 +50,16 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( size_type, ) -from cudf._lib.pylibcudf.libcudf.aggregation import Kind # no-cython-lint -from cudf._lib.pylibcudf.libcudf.aggregation import \ +from pylibcudf.libcudf.aggregation import Kind # no-cython-lint +from pylibcudf.libcudf.aggregation import \ correlation_type as CorrelationType # no-cython-lint -from cudf._lib.pylibcudf.libcudf.aggregation import \ +from pylibcudf.libcudf.aggregation import \ ewm_history as EWMHistory # no-cython-lint -from cudf._lib.pylibcudf.libcudf.aggregation import \ +from pylibcudf.libcudf.aggregation import \ rank_method as RankMethod # no-cython-lint -from cudf._lib.pylibcudf.libcudf.aggregation import \ +from pylibcudf.libcudf.aggregation import \ rank_percentage as RankPercentage # no-cython-lint -from cudf._lib.pylibcudf.libcudf.aggregation import ( # no-cython-lint - udf_type as UdfType, -) +from pylibcudf.libcudf.aggregation import udf_type as UdfType # no-cython-lint from .types cimport DataType @@ -71,7 +68,7 @@ cdef class Aggregation: """A type of aggregation to perform. 
Aggregations are passed to APIs like - :py:func:`~cudf._lib.pylibcudf.groupby.GroupBy.aggregate` to indicate what + :py:func:`~pylibcudf.groupby.GroupBy.aggregate` to indicate what operations to perform. Using a class for aggregations provides a unified API for handling parametrizable aggregations. This class should never be instantiated directly, only via one of the factory functions. diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/binaryop.pxd similarity index 90% rename from python/cudf/cudf/_lib/pylibcudf/binaryop.pxd rename to python/pylibcudf/pylibcudf/binaryop.pxd index 2411e28ac66..06625e9e2db 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/binaryop.pxd @@ -1,8 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool - -from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator +from pylibcudf.libcudf.binaryop cimport binary_operator from .column cimport Column from .scalar cimport Scalar diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx similarity index 86% rename from python/cudf/cudf/_lib/pylibcudf/binaryop.pyx rename to python/pylibcudf/pylibcudf/binaryop.pyx index 44d9f4ad04a..5a67f4d6cdb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -5,12 +5,11 @@ from cython.operator import dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf cimport binaryop as cpp_binaryop +from pylibcudf.libcudf.binaryop cimport binary_operator +from pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf cimport binaryop as cpp_binaryop -from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator -from cudf._lib.pylibcudf.libcudf.column.column cimport column - -from cudf._lib.pylibcudf.libcudf.binaryop import \ +from pylibcudf.libcudf.binaryop import \ binary_operator as BinaryOperator # no-cython-lint from .column cimport Column @@ -27,9 +26,9 @@ cpdef Column binary_operation( """Perform a binary operation between a column and another column or scalar. ``lhs`` and ``rhs`` may be a - :py:class:`~cudf._lib.pylibcudf.column.Column` or a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`, but at least one must be a - :py:class:`~cudf._lib.pylibcudf.column.Column`. + :py:class:`~pylibcudf.column.Column` or a + :py:class:`~pylibcudf.scalar.Scalar`, but at least one must be a + :py:class:`~pylibcudf.column.Column`. For details, see :cpp:func:`binary_operation`. 
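
For orientation outside the diff: after this series, the module above is importable as `pylibcudf.binaryop`. Below is a minimal usage sketch, assuming a built pylibcudf and that `plc.interop.from_arrow`/`to_arrow` round-trip pyarrow arrays (treat the exact calls as illustrative, not as the patch's own test code):

    import pyarrow as pa
    import pylibcudf as plc

    # Build two device Columns from host data via the Arrow interop layer.
    lhs = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int64()))
    rhs = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int64()))

    # lhs and rhs may each be a Column or a Scalar, but at least one must be
    # a Column; the output type is requested explicitly.
    out = plc.binaryop.binary_operation(
        lhs,
        rhs,
        plc.binaryop.BinaryOperator.ADD,
        plc.types.DataType(plc.types.TypeId.INT64),
    )
    print(plc.interop.to_arrow(out))  # expected: [11, 22, 33]
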
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/pylibcudf/pylibcudf/column.pxd similarity index 84% rename from python/cudf/cudf/_lib/pylibcudf/column.pxd rename to python/pylibcudf/pylibcudf/column.pxd index 13ee0a70681..92d63e4e495 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/pylibcudf/pylibcudf/column.pxd @@ -2,16 +2,13 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview from .types cimport DataType diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx similarity index 98% rename from python/cudf/cudf/_lib/pylibcudf/column.pyx rename to python/pylibcudf/pylibcudf/column.pyx index 1d9902b0374..a37a12fc7e1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -3,16 +3,13 @@ from cython.operator cimport dereference from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column, column_contents +from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport size_type from rmm._lib.device_buffer cimport DeviceBuffer -from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents -from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( - make_column_from_scalar, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type - from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar from .types cimport DataType, size_of, type_id diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd similarity index 92% rename from python/cudf/cudf/_lib/pylibcudf/column_factories.pxd rename to python/pylibcudf/pylibcudf/column_factories.pxd index 9dbd74ab16c..fef02359240 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -1,8 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.types cimport mask_state, size_type from .column cimport Column from .types cimport DataType, size_type, type_id diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/column_factories.pyx rename to python/pylibcudf/pylibcudf/column_factories.pyx index ef7f512f0e5..4601cba515a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -1,9 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_factories cimport ( make_duration_column as cpp_make_duration_column, make_empty_column as cpp_make_empty_column, make_fixed_point_column as cpp_make_fixed_point_column, @@ -11,7 +10,7 @@ from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column as cpp_make_numeric_column, make_timestamp_column as cpp_make_timestamp_column, ) -from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.types cimport mask_state, size_type from .types cimport DataType, type_id diff --git a/python/cudf/cudf/_lib/pylibcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/concatenate.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/concatenate.pxd rename to python/pylibcudf/pylibcudf/concatenate.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx similarity index 80% rename from python/cudf/cudf/_lib/pylibcudf/concatenate.pyx rename to python/pylibcudf/pylibcudf/concatenate.pyx index 5e40f921b2c..8bdcc086e0f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -3,12 +3,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf cimport concatenate as cpp_concatenate -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf cimport concatenate as cpp_concatenate +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/pylibcudf/pylibcudf/copying.pxd similarity index 94% rename from python/cudf/cudf/_lib/pylibcudf/copying.pxd rename to python/pylibcudf/pylibcudf/copying.pxd index 06543d3ca92..7dfed437673 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/copying.pxd @@ -1,12 +1,11 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from libcpp cimport bool as cbool - -from cudf._lib.pylibcudf.libcudf.copying cimport ( +from pylibcudf.libcudf.copying cimport ( mask_allocation_policy, out_of_bounds_policy, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .scalar cimport Scalar diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/copying.pyx rename to python/pylibcudf/pylibcudf/copying.pyx index 2d59deb3864..9743119d92a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -6,29 +6,28 @@ from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - # TODO: We want to make cpp a more full-featured package so that we can access # directly from that. It will make namespacing much cleaner in pylibcudf. What # we really want here would be # cimport libcudf... libcudf.copying.algo(...) -from cudf._lib.pylibcudf.libcudf cimport copying as cpp_copying -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.libcudf cimport copying as cpp_copying +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.copying cimport ( +from pylibcudf.libcudf.copying cimport ( mask_allocation_policy, out_of_bounds_policy, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.libcudf.copying import \ +from pylibcudf.libcudf.copying import \ mask_allocation_policy as MaskAllocationPolicy # no-cython-lint -from cudf._lib.pylibcudf.libcudf.copying import \ +from pylibcudf.libcudf.copying import \ out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/datetime.pxd rename to python/pylibcudf/pylibcudf/datetime.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx similarity index 78% rename from python/cudf/cudf/_lib/pylibcudf/datetime.pyx rename to python/pylibcudf/pylibcudf/datetime.pyx index 82351327de6..0ddc68bcb9d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -1,11 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.datetime cimport ( - extract_year as cpp_extract_year, -) +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd b/python/pylibcudf/pylibcudf/exception_handler.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd rename to python/pylibcudf/pylibcudf/exception_handler.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pxd b/python/pylibcudf/pylibcudf/experimental.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/experimental.pxd rename to python/pylibcudf/pylibcudf/experimental.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx similarity index 92% rename from python/cudf/cudf/_lib/pylibcudf/experimental.pyx rename to python/pylibcudf/pylibcudf/experimental.pyx index 1e2a682d879..b25a53e13b2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/experimental.pyx +++ b/python/pylibcudf/pylibcudf/experimental.pyx @@ -2,8 +2,7 @@ from libcpp cimport bool from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf cimport experimental as cpp_experimental +from pylibcudf.libcudf cimport experimental as cpp_experimental cpdef enable_prefetching(str key): diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pxd b/python/pylibcudf/pylibcudf/expressions.pxd similarity index 91% rename from python/cudf/cudf/_lib/pylibcudf/expressions.pxd rename to python/pylibcudf/pylibcudf/expressions.pxd index 64825b89d9f..65660b7c449 100644 --- a/python/cudf/cudf/_lib/pylibcudf/expressions.pxd +++ b/python/pylibcudf/pylibcudf/expressions.pxd @@ -1,8 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf.expressions cimport ( +from pylibcudf.libcudf.expressions cimport ( ast_operator, expression, table_reference, diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx similarity index 94% rename from python/cudf/cudf/_lib/pylibcudf/expressions.pyx rename to python/pylibcudf/pylibcudf/expressions.pyx index b983a617533..a44c9e25987 100644 --- a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx +++ b/python/pylibcudf/pylibcudf/expressions.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.libcudf.expressions import \ +from pylibcudf.libcudf.expressions import \ ast_operator as ASTOperator # no-cython-lint -from cudf._lib.pylibcudf.libcudf.expressions import \ +from pylibcudf.libcudf.expressions import \ table_reference as TableReference # no-cython-lint from cython.operator cimport dereference @@ -9,22 +9,21 @@ from libc.stdint cimport int32_t, int64_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( +from pylibcudf.libcudf cimport expressions as libcudf_exp +from pylibcudf.libcudf.scalar.scalar cimport ( duration_scalar, numeric_scalar, string_scalar, timestamp_scalar, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id -from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport ( +from pylibcudf.libcudf.types cimport size_type, type_id +from pylibcudf.libcudf.wrappers.durations cimport ( duration_ms, duration_ns, duration_s, duration_us, ) -from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( +from pylibcudf.libcudf.wrappers.timestamps cimport ( timestamp_ms, timestamp_ns, timestamp_s, diff --git a/python/cudf/cudf/_lib/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd similarity index 90% rename from python/cudf/cudf/_lib/pylibcudf/filling.pxd rename to python/pylibcudf/pylibcudf/filling.pxd index 3560ebf2ea2..b9345f8cd42 100644 --- a/python/cudf/cudf/_lib/pylibcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/filling.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .scalar cimport Scalar diff --git a/python/cudf/cudf/_lib/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx similarity index 94% rename from python/cudf/cudf/_lib/pylibcudf/filling.pyx rename to python/pylibcudf/pylibcudf/filling.pyx index 05f67681428..61b430e64aa 100644 --- a/python/cudf/cudf/_lib/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -3,16 +3,15 @@ from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.filling cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.filling cimport ( fill as cpp_fill, fill_in_place as cpp_fill_in_place, repeat as cpp_repeat, sequence as cpp_sequence, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .scalar cimport Scalar diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd b/python/pylibcudf/pylibcudf/gpumemoryview.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd rename to python/pylibcudf/pylibcudf/gpumemoryview.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx rename to python/pylibcudf/pylibcudf/gpumemoryview.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/pylibcudf/pylibcudf/groupby.pxd similarity index 87% rename from 
python/cudf/cudf/_lib/pylibcudf/groupby.pxd rename to python/pylibcudf/pylibcudf/groupby.pxd index eaa05c26986..79af2f1b746 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/groupby.pxd @@ -3,20 +3,19 @@ from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.aggregation cimport ( +from pylibcudf.libcudf.aggregation cimport ( aggregation, groupby_aggregation, groupby_scan_aggregation, ) -from cudf._lib.pylibcudf.libcudf.groupby cimport ( +from pylibcudf.libcudf.groupby cimport ( aggregation_request, aggregation_result, groupby, scan_request, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/groupby.pyx rename to python/pylibcudf/pylibcudf/groupby.pyx index f5bb46ca6a2..ae5d33aaa46 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -6,18 +6,17 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.groupby cimport ( +from pylibcudf.libcudf.groupby cimport ( aggregation_request, aggregation_result, groupby, groups, scan_request, ) -from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.replace cimport replace_policy +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column @@ -156,7 +155,7 @@ cdef class GroupBy: Parameters ---------- requests : List[GroupByRequest] - The list of `~.cudf._lib.pylibcudf.groupby.GroupByRequest` , each + The list of `~.pylibcudf.groupby.GroupByRequest` , each representing a set of aggregations to perform on a given column of values. Returns @@ -188,7 +187,7 @@ cdef class GroupBy: Parameters ---------- requests : List[GroupByRequest] - The list of `~.cudf._lib.pylibcudf.groupby.GroupByRequest` , each + The list of `~.pylibcudf.groupby.GroupByRequest` , each representing a set of aggregations to perform on a given column of values. 
Returns diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx similarity index 98% rename from python/cudf/cudf/_lib/pylibcudf/interop.pyx rename to python/pylibcudf/pylibcudf/interop.pyx index caa19724786..d54e5b7ba1f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -11,8 +11,8 @@ from functools import singledispatch from pyarrow import lib as pa -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.interop cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.interop cimport ( ArrowArray, ArrowArrayStream, ArrowSchema, @@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.libcudf.interop cimport ( to_arrow_host_raw, to_arrow_schema_raw, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table cimport table from . cimport copying from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt rename to python/pylibcudf/pylibcudf/io/CMakeLists.txt diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd rename to python/pylibcudf/pylibcudf/io/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/io/__init__.py rename to python/pylibcudf/pylibcudf/io/__init__.py diff --git a/python/pylibcudf/pylibcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/io/avro.pxd new file mode 100644 index 00000000000..8696fcb3c15 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/avro.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.avro cimport avro_reader_options +from pylibcudf.libcudf.types cimport size_type + + +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = *, + size_type skip_rows = *, + size_type num_rows = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx similarity index 89% rename from python/cudf/cudf/_lib/pylibcudf/io/avro.pyx rename to python/pylibcudf/pylibcudf/io/avro.pyx index 538bd8aa322..667c67f4c36 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -3,13 +3,12 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata -from cudf._lib.pylibcudf.libcudf.io.avro cimport ( +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.avro cimport ( avro_reader_options, read_avro as cpp_read_avro, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type cpdef TableWithMetadata read_avro( diff --git a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx similarity index 97% rename from python/cudf/cudf/_lib/pylibcudf/io/csv.pyx rename to python/pylibcudf/pylibcudf/io/csv.pyx index e9efb5befee..b53d6771cd6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -5,19 +5,18 @@ from libcpp.map cimport map from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata -from cudf._lib.pylibcudf.libcudf.io.csv cimport ( +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.csv cimport ( csv_reader_options, read_csv as cpp_read_csv, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.libcudf.io.types cimport ( compression_type, quote_style, table_with_metadata, ) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type -from cudf._lib.pylibcudf.types cimport DataType +from pylibcudf.libcudf.types cimport data_type, size_type +from pylibcudf.types cimport DataType cdef tuple _process_parse_dates_hex(list cols): diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd b/python/pylibcudf/pylibcudf/io/datasource.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd rename to python/pylibcudf/pylibcudf/io/datasource.pxd index a0a9c3fa0d4..05c03dceee2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd +++ b/python/pylibcudf/pylibcudf/io/datasource.pxd @@ -1,9 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport shared_ptr - -from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source -from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source +from pylibcudf.libcudf.io.datasource cimport datasource cdef class Datasource: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx similarity index 87% rename from python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx rename to python/pylibcudf/pylibcudf/io/datasource.pyx index 8f265f585de..6cc509b74cb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -3,9 +3,8 @@ from libcpp.memory cimport shared_ptr from pyarrow.includes.libarrow cimport CRandomAccessFile from pyarrow.lib cimport NativeFile - -from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source -from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source +from pylibcudf.libcudf.io.datasource cimport datasource import warnings diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd similarity index 85% rename from python/cudf/cudf/_lib/pylibcudf/io/json.pxd rename to python/pylibcudf/pylibcudf/io/json.pxd index 2e0e92a054f..ab9b5b99ce2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -1,14 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool - -from cudf._lib.pylibcudf.io.types cimport ( +from pylibcudf.io.types cimport ( SinkInfo, SourceInfo, TableWithMetadata, compression_type, ) -from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from pylibcudf.libcudf.types cimport size_type cpdef TableWithMetadata read_json( diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx similarity index 95% rename from python/cudf/cudf/_lib/pylibcudf/io/json.pyx rename to python/pylibcudf/pylibcudf/io/json.pyx index 2710ee60075..ce086f4a489 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -5,14 +5,9 @@ from libcpp.map cimport map from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.concatenate cimport concatenate -from cudf._lib.pylibcudf.io.types cimport ( - SinkInfo, - SourceInfo, - TableWithMetadata, -) -from cudf._lib.pylibcudf.libcudf.io.json cimport ( +from pylibcudf.concatenate cimport concatenate +from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.json cimport ( json_reader_options, json_recovery_mode_t, json_writer_options, @@ -20,13 +15,13 @@ from cudf._lib.pylibcudf.libcudf.io.json cimport ( schema_element, write_json as cpp_write_json, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.libcudf.io.types cimport ( compression_type, table_metadata, table_with_metadata, ) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type -from cudf._lib.pylibcudf.types cimport DataType +from pylibcudf.libcudf.types cimport data_type, size_type +from pylibcudf.types cimport DataType cdef map[string, schema_element] _generate_schema_map(list dtypes): @@ -270,7 +265,7 @@ cpdef void write_json( str 
false_value = "false" ): """ - Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format. + Writes a :py:class:`~pylibcudf.table.Table` to JSON format. Parameters ---------- diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd similarity index 72% rename from python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd rename to python/pylibcudf/pylibcudf/io/parquet.pxd index 93ef849b813..47458b00159 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -3,14 +3,13 @@ from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.expressions cimport Expression -from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata -from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( +from pylibcudf.expressions cimport Expression +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.types cimport DataType +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.types cimport DataType cdef class ChunkedParquetReader: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx similarity index 93% rename from python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx rename to python/pylibcudf/pylibcudf/io/parquet.pyx index 84a79f9565f..fb5244a2a9e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -5,17 +5,16 @@ from libcpp cimport bool from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.expressions cimport Expression -from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata -from cudf._lib.pylibcudf.libcudf.expressions cimport expression -from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( +from pylibcudf.expressions cimport Expression +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.expressions cimport expression +from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, parquet_reader_options, read_parquet as cpp_read_parquet, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.io.types cimport table_with_metadata +from pylibcudf.libcudf.types cimport size_type cdef parquet_reader_options _setup_parquet_reader_options( @@ -169,7 +168,7 @@ cpdef read_parquet( row_groups : list[list[size_type]], default None List of row groups to be read. filters : Expression, default None - An AST :py:class:`cudf._lib.pylibcudf.expressions.Expression` + An AST :py:class:`pylibcudf.expressions.Expression` to use for predicate pushdown. convert_strings_to_categories : bool, default False Whether to convert string columns to the category type diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd similarity index 87% rename from python/cudf/cudf/_lib/pylibcudf/io/types.pxd rename to python/pylibcudf/pylibcudf/io/types.pxd index 0094bf6032c..0ab28cb0973 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -1,9 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.libcudf.io.data_sink cimport data_sink +from pylibcudf.libcudf.io.types cimport ( column_encoding, column_in_metadata, column_name_info, @@ -19,7 +18,7 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport ( table_metadata, table_with_metadata, ) -from cudf._lib.pylibcudf.table cimport Table +from pylibcudf.table cimport Table cdef class TableWithMetadata: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/io/types.pyx rename to python/pylibcudf/pylibcudf/io/types.pyx index 95fa7d4c2ee..1600a805b37 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -6,11 +6,10 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.io.datasource cimport Datasource -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource -from cudf._lib.pylibcudf.libcudf.io.types cimport ( +from pylibcudf.io.datasource cimport Datasource +from pylibcudf.libcudf.io.data_sink cimport data_sink +from pylibcudf.libcudf.io.datasource cimport datasource +from pylibcudf.libcudf.io.types cimport ( column_name_info, host_buffer, source_info, @@ -22,9 +21,9 @@ import errno import io import os -from cudf._lib.pylibcudf.libcudf.io.json import \ +from pylibcudf.libcudf.io.json import \ json_recovery_mode_t as JSONRecoveryMode # no-cython-lint -from cudf._lib.pylibcudf.libcudf.io.types import \ +from pylibcudf.libcudf.io.types import \ compression_type as CompressionType # no-cython-lint diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pxd b/python/pylibcudf/pylibcudf/join.pxd similarity index 91% rename from python/cudf/cudf/_lib/pylibcudf/join.pxd rename to python/pylibcudf/pylibcudf/join.pxd index 83b4776c16e..06969b4a2db 100644 --- a/python/cudf/cudf/_lib/pylibcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/join.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.libcudf.types cimport null_equality +from pylibcudf.libcudf.types cimport null_equality from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx similarity index 95% rename from python/cudf/cudf/_lib/pylibcudf/join.pyx rename to python/pylibcudf/pylibcudf/join.pyx index 2ded84d84d1..25664286f19 100644 --- a/python/cudf/cudf/_lib/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -4,14 +4,13 @@ from cython.operator import dereference from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf cimport join as cpp_join +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport null_equality from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.pylibcudf.libcudf cimport join as cpp_join -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport null_equality - from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt rename to python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.pxd rename to python/pylibcudf/pylibcudf/libcudf/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.py b/python/pylibcudf/pylibcudf/libcudf/__init__.py similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.py rename to python/pylibcudf/pylibcudf/libcudf/__init__.py diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd b/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd similarity index 98% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd rename to python/pylibcudf/pylibcudf/libcudf/aggregation.pxd index fe04db52094..58c579b86de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd @@ -5,8 +5,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.types cimport ( data_type, interpolation, nan_equality, diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/libcudf/aggregation.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pyx rename to python/pylibcudf/pylibcudf/libcudf/aggregation.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd similarity index 85% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd rename to python/pylibcudf/pylibcudf/libcudf/binaryop.pxd index 78da5980db4..d39767b4aa8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd @@ -4,12 +4,11 @@ from libc.stdint cimport int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string - -from 
cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport data_type +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/libcudf/binaryop.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pyx rename to python/pylibcudf/pylibcudf/libcudf/binaryop.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/column/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.pxd rename to python/pylibcudf/pylibcudf/libcudf/column/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.py b/python/pylibcudf/pylibcudf/libcudf/column/__init__.py similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.py rename to python/pylibcudf/pylibcudf/libcudf/column/__init__.py diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd similarity index 87% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd rename to python/pylibcudf/pylibcudf/libcudf/column/column.pxd index dd184d31cc6..7a369701bbd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -3,14 +3,13 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector - -from rmm._lib.device_buffer cimport device_buffer - -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from pylibcudf.libcudf.types cimport data_type, size_type + +from rmm._lib.device_buffer cimport device_buffer cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd similarity index 93% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd rename to python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index 2faff21a77b..f1a326bcd40 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -1,12 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from rmm._lib.device_buffer cimport device_buffer - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport ( bitmask_type, data_type, mask_state, @@ -14,6 +11,8 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( type_id, ) +from rmm._lib.device_buffer cimport device_buffer + cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[column] make_numeric_column(data_type type, diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd similarity index 97% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd rename to python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd index c6403babe89..c0e971eb5bd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd @@ -2,12 +2,7 @@ from libcpp cimport bool from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.types cimport ( - bitmask_type, - data_type, - size_type, -) +from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd similarity index 77% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd rename to python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 0c362390ff2..92f5a185a54 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -2,13 +2,12 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.libcudf.column.column cimport column, column_view +from pylibcudf.libcudf.table.table cimport table, table_view +from pylibcudf.libcudf.utilities.host_span cimport host_span from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view -from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span - cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: # The versions of concatenate taking vectors don't exist in libcudf diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd similarity index 85% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd rename to python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index b06feacb016..cadac6a0022 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -3,12 +3,11 @@ from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type - cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" 
nogil: cdef cppclass packed_columns: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd similarity index 90% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd rename to python/pylibcudf/pylibcudf/libcudf/copying.pxd index af3a16ad01b..4d4a4ba9b89 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -5,19 +5,18 @@ from libcpp cimport bool from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector - -from rmm._lib.device_buffer cimport device_buffer - -from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type + +from rmm._lib.device_buffer cimport device_buffer ctypedef const scalar constscalar diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pyx b/python/pylibcudf/pylibcudf/libcudf/copying.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pyx rename to python/pylibcudf/pylibcudf/libcudf/copying.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd similarity index 92% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd rename to python/pylibcudf/pylibcudf/libcudf/datetime.pxd index 7db77b9c7c5..a4465343197 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,10 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd b/python/pylibcudf/pylibcudf/libcudf/experimental.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd rename to python/pylibcudf/pylibcudf/libcudf/experimental.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/pylibcudf/pylibcudf/libcudf/expressions.pxd similarity index 90% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd rename to python/pylibcudf/pylibcudf/libcudf/expressions.pxd index 427e16d4ff8..5ba2dff6074 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/expressions.pxd @@ -3,15 +3,14 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport ( duration_scalar, numeric_scalar, timestamp_scalar, ) -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/pylibcudf/pylibcudf/libcudf/expressions.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx rename to python/pylibcudf/pylibcudf/libcudf/expressions.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/filling.pxd similarity index 74% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd rename to python/pylibcudf/pylibcudf/libcudf/filling.pxd index 16ed682f930..7bed80050d2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/filling.pxd @@ -2,16 +2,15 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd 
b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd similarity index 83% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd rename to python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 16607cc3711..848462131fe 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -5,25 +5,24 @@ from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.aggregation cimport ( +from pylibcudf.libcudf.aggregation cimport ( groupby_aggregation, groupby_scan_aggregation, ) -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.replace cimport replace_policy +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport ( null_order, null_policy, order, size_type, sorted, ) -from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span +from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd similarity index 86% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd rename to python/pylibcudf/pylibcudf/libcudf/hash.pxd index 5346252df69..51678ba69d8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -3,10 +3,9 @@ from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd similarity index 87% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd rename to python/pylibcudf/pylibcudf/libcudf/interop.pxd index 24d96b602dc..c7efff2340d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -3,14 +3,11 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector - -from cudf._lib.types import cudf_to_np_types, np_to_cudf_types - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from 
cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "dlpack/dlpack.h" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt rename to python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/io/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.py b/python/pylibcudf/pylibcudf/libcudf/io/__init__.py similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.py rename to python/pylibcudf/pylibcudf/libcudf/io/__init__.py diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd b/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd similarity index 86% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd index 1d2138f8d10..54a913a9ce3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd @@ -1,11 +1,10 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource from libcpp.memory cimport shared_ptr from libcpp.string cimport string from pyarrow.includes.libarrow cimport CRandomAccessFile -cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource - cdef extern from "cudf/io/arrow_io_source.hpp" \ namespace "cudf::io" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd similarity index 91% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/avro.pxd index 530df5aa8f1..2d76e2f6c80 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd @@ -1,10 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +cimport pylibcudf.libcudf.io.types as cudf_io_types from libcpp.string cimport string from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/io/avro.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd similarity index 98% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/csv.pxd index b5ff6558cd8..73a6d98650c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd @@ -1,15 +1,14 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
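
The pattern in every hunk above and below is mechanical: each cudf._lib.pylibcudf.libcudf.* cimport path becomes pylibcudf.libcudf.*, since pylibcudf now ships as its own top-level package under python/pylibcudf. A minimal before/after sketch from a downstream declaration file -- the file name and the second cimport are illustrative, not part of this patch:

    # consumer.pxd (hypothetical downstream file)
    # before the rename:
    #   from cudf._lib.pylibcudf.libcudf.column.column cimport column
    # after the rename:
    from pylibcudf.libcudf.column.column cimport column
    from pylibcudf.libcudf.types cimport size_type

Only the package prefix changes; the module layout below libcudf/ is preserved by the renames.
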
+cimport pylibcudf.libcudf.io.types as cudf_io_types +cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from pylibcudf.libcudf.types cimport data_type, size_type cdef extern from "cudf/io/csv.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/data_sink.pxd b/python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/data_sink.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/datasource.pxd b/python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/datasource.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/json.pxd index 86621ae184f..7514e6c5258 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -1,15 +1,14 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +cimport pylibcudf.libcudf.io.types as cudf_io_types +cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from pylibcudf.libcudf.types cimport data_type, size_type cdef extern from "cudf/io/json.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx b/python/pylibcudf/pylibcudf/libcudf/io/json.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx rename to python/pylibcudf/pylibcudf/libcudf/io/json.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd similarity index 97% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/orc.pxd index 25f91849dea..e4a09b8feb2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+cimport pylibcudf.libcudf.io.types as cudf_io_types +cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.map cimport map @@ -7,10 +9,7 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from pylibcudf.libcudf.types cimport data_type, size_type cdef extern from "cudf/io/orc.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd similarity index 94% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd index a23655b06f8..db6cb0cdfa5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd @@ -1,13 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t from libcpp cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.variant cimport monostate, variant +from pylibcudf.variant cimport monostate, variant cdef extern from "cudf/io/orc_metadata.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd similarity index 80% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd index d86915c7da9..222d87defa0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd @@ -8,17 +8,25 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view -from cudf._lib.pylibcudf.libcudf.expressions cimport expression -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from pylibcudf.libcudf.expressions cimport expression +from pylibcudf.libcudf.io.types cimport ( + compression_type, + dictionary_policy, + partition_info, + sink_info, + source_info, + statistics_freq, + table_input_metadata, + table_with_metadata, +) +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport data_type, size_type cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cppclass parquet_reader_options: parquet_reader_options() except + - cudf_io_types.source_info get_source_info() except + + source_info get_source_info() except + vector[vector[size_type]] get_row_groups() except + const optional[reference_wrapper[expression]]& get_filter() except + data_type get_timestamp_type() except + @@ -38,13 +46,13 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: @staticmethod parquet_reader_options_builder builder( - cudf_io_types.source_info src + source_info src ) except + cdef cppclass parquet_reader_options_builder: 
parquet_reader_options_builder() except + parquet_reader_options_builder( - cudf_io_types.source_info src + source_info src ) except + parquet_reader_options_builder& columns( vector[string] col_names @@ -69,15 +77,15 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: ) except + parquet_reader_options build() except + - cdef cudf_io_types.table_with_metadata read_parquet( + cdef table_with_metadata read_parquet( parquet_reader_options args) except + cdef cppclass parquet_writer_options_base: parquet_writer_options_base() except + - cudf_io_types.sink_info get_sink_info() except + - cudf_io_types.compression_type get_compression() except + - cudf_io_types.statistics_freq get_stats_level() except + - const optional[cudf_io_types.table_input_metadata]& get_metadata( + sink_info get_sink_info() except + + compression_type get_compression() except + + statistics_freq get_stats_level() except + + const optional[table_input_metadata]& get_metadata( ) except + size_t get_row_group_size_bytes() except + size_type get_row_group_size_rows() except + @@ -87,16 +95,16 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: bool is_enabled_write_arrow_schema() except + void set_metadata( - cudf_io_types.table_input_metadata m + table_input_metadata m ) except + void set_key_value_metadata( vector[map[string, string]] kvm ) except + void set_stats_level( - cudf_io_types.statistics_freq sf + statistics_freq sf ) except + void set_compression( - cudf_io_types.compression_type compression + compression_type compression ) except + void set_int96_timestamps( bool enabled @@ -111,14 +119,14 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_max_dictionary_size(size_t val) except + void enable_write_v2_headers(bool val) except + void enable_write_arrow_schema(bool val) except + - void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + + void set_dictionary_policy(dictionary_policy policy) except + cdef cppclass parquet_writer_options(parquet_writer_options_base): parquet_writer_options() except + - cudf_table_view.table_view get_table() except + + table_view get_table() except + string get_column_chunks_file_paths() except + void set_partitions( - vector[cudf_io_types.partition_info] partitions + vector[partition_info] partitions ) except + void set_column_chunks_file_paths( vector[string] column_chunks_file_paths @@ -126,24 +134,24 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: @staticmethod parquet_writer_options_builder builder( - cudf_io_types.sink_info sink_, - cudf_table_view.table_view table_ + sink_info sink_, + table_view table_ ) except + cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]: parquet_writer_options_builder_base() except + BuilderT& metadata( - cudf_io_types.table_input_metadata m + table_input_metadata m ) except + BuilderT& key_value_metadata( vector[map[string, string]] kvm ) except + BuilderT& stats_level( - cudf_io_types.statistics_freq sf + statistics_freq sf ) except + BuilderT& compression( - cudf_io_types.compression_type compression + compression_type compression ) except + BuilderT& int96_timestamps( bool enabled @@ -173,7 +181,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: bool val ) except + BuilderT& dictionary_policy( - cudf_io_types.dictionary_policy val + dictionary_policy val ) except + OptionsT build() except + @@ -182,11 +190,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options]): 
parquet_writer_options_builder() except + parquet_writer_options_builder( - cudf_io_types.sink_info sink_, - cudf_table_view.table_view table_ + sink_info sink_, + table_view table_ ) except + parquet_writer_options_builder& partitions( - vector[cudf_io_types.partition_info] partitions + vector[partition_info] partitions ) except + parquet_writer_options_builder& column_chunks_file_paths( vector[string] column_chunks_file_paths @@ -201,7 +209,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: @staticmethod chunked_parquet_writer_options_builder builder( - cudf_io_types.sink_info sink_, + sink_info sink_, ) except + cdef cppclass chunked_parquet_writer_options_builder( @@ -210,18 +218,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: ): chunked_parquet_writer_options_builder() except + chunked_parquet_writer_options_builder( - cudf_io_types.sink_info sink_, + sink_info sink_, ) except + cdef cppclass parquet_chunked_writer: parquet_chunked_writer() except + parquet_chunked_writer(chunked_parquet_writer_options args) except + parquet_chunked_writer& write( - cudf_table_view.table_view table_, + table_view table_, ) except + parquet_chunked_writer& write( - const cudf_table_view.table_view& table_, - const vector[cudf_io_types.partition_info]& partitions, + const table_view& table_, + const vector[partition_info]& partitions, ) except + unique_ptr[vector[uint8_t]] close( vector[string] column_chunks_file_paths, @@ -237,7 +245,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t pass_read_limit, const parquet_reader_options& options) except + bool has_next() except + - cudf_io_types.table_with_metadata read_chunk() except + + table_with_metadata read_chunk() except + cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd similarity index 89% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd index 34a299b73ab..8e6da56c9a6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd @@ -1,12 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
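
Note that the parquet.pxd hunk above goes one step further than the path rewrite: rather than keeping the cudf_io_types and cudf_table_view module aliases, it cimports the needed names (source_info, sink_info, table_with_metadata, table_view, and so on) directly, so every qualified reference in the declarations shrinks to a bare name. Both spellings are valid Cython; a sketch of the two styles side by side, where the cdef variables are hypothetical and added only for illustration:

    # module-alias style, kept in csv.pxd / orc.pxd / json.pxd:
    cimport pylibcudf.libcudf.io.types as cudf_io_types
    # direct-cimport style, adopted by the rewritten parquet.pxd:
    from pylibcudf.libcudf.io.types cimport source_info

    cdef cudf_io_types.source_info a   # same C++ type either way
    cdef source_info b

The direct style keeps the long extern block readable, at the cost of a larger cimport list at the top of the file.
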
+cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int64_t from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/text.pxd index bec223d4079..14397ef970d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd @@ -4,8 +4,7 @@ from libc.stdint cimport uint64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column cimport column cdef extern from "cudf/io/text/byte_range_info.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd similarity index 86% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd index 88cb5544dc1..676901efcec 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd @@ -4,8 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.optional cimport optional from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table cimport table cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd similarity index 92% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd rename to python/pylibcudf/pylibcudf/libcudf/io/types.pxd index 0a6bddcd907..a3d99807876 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd @@ -1,5 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink +cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource +cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map @@ -9,12 +12,8 @@ from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from pyarrow.includes.libarrow cimport CRandomAccessFile - -cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink -cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource -cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/io/types.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx b/python/pylibcudf/pylibcudf/libcudf/io/types.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx rename to python/pylibcudf/pylibcudf/libcudf/io/types.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd similarity index 88% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd rename to python/pylibcudf/pylibcudf/libcudf/join.pxd index 32cd17f7c11..6f6c145b23c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -4,14 +4,13 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport null_equality, size_type from rmm._lib.device_uvector cimport device_uvector -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport null_equality, size_type - ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd similarity index 78% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd rename to python/pylibcudf/pylibcudf/libcudf/labeling.pxd index 54731bf29af..ec6ef6b2a41 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd @@ -1,9 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.py b/python/pylibcudf/pylibcudf/libcudf/lists/__init__.py similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.py rename to python/pylibcudf/pylibcudf/libcudf/lists/__init__.py diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd similarity index 78% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd index 728bd840f71..d077958ce03 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd @@ -1,10 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd similarity index 75% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd index 40bb2e78970..81a5ad46389 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd @@ -2,14 +2,11 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd similarity index 61% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd index ba57a839fbc..e283551ed0c 100644 --- 
a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd @@ -1,11 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd similarity index 59% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd index 622a866f593..c64b2715cca 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd @@ -1,10 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd similarity index 64% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd index 53609ba8830..2ea060d87de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd @@ -1,12 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column, column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd similarity index 76% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd index 8403fd179f7..54f5a8409b6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd @@ -1,9 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd similarity index 67% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd index ab7ed141365..a762c6aa333 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd @@ -1,11 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd similarity index 86% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd index 8917a6ac899..f43340a78b0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -1,10 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd similarity index 62% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd index 0382a5d42c3..43b671ebfa0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd @@ -1,11 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd similarity index 81% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd index eb796897f87..266f04ef6b3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd @@ -1,12 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.types cimport nan_equality, null_equality cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd index 337ac73908b..ea45f999c47 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd @@ -1,12 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.types cimport null_order, order cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd similarity index 68% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd rename to python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd index b1fcf7800b0..d9df7c3ca2e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -1,12 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.types cimport nan_equality, null_equality cdef extern from "cudf/lists/stream_compaction.hpp" \ diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd b/python/pylibcudf/pylibcudf/libcudf/merge.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd rename to python/pylibcudf/pylibcudf/libcudf/merge.pxd index dacb3dc2d74..6930b7a0d06 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/merge.pxd @@ -1,11 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +cimport pylibcudf.libcudf.types as libcudf_types from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/merge.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd similarity index 80% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd rename to python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 0cab404c05f..3fc2c7e8f1e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -2,17 +2,12 @@ from libc.stdint cimport int32_t from libcpp.pair cimport pair +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( - bitmask_type, - mask_state, - size_type, -) - ctypedef int32_t underlying_type_t_mask_state diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.py similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.py rename to python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.py diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd similarity index 73% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd index 033a820d2ef..fd768d22704 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd +++ 
b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd @@ -2,10 +2,9 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd similarity index 75% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd index ca1f6650a5a..d459372fb8f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd @@ -2,9 +2,8 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd index 2034b1c1ee5..eefae746662 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd @@ -1,11 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd similarity index 61% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd index 789a1a2c35a..16c5f7f575e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd @@ -1,10 +1,9 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/jaccard.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd similarity index 70% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index fc5577bf3f9..0c352a5068b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,10 +1,9 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd similarity index 58% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd index 229f4d8f5a3..89f6e5edfc4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd @@ -1,11 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd similarity index 75% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index 65c63b089df..cbf121920e1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -2,9 +2,8 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd index aaad28d2684..6bcfa1d9380 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd @@ -1,11 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/replace.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd similarity index 79% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index 040d4c9de63..673bffa28ae 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -2,10 +2,9 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd similarity index 92% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd index cce40bcd3f6..aabac0a617b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd @@ -4,9 +4,8 @@ from libc.stdint cimport uint16_t, uint32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd similarity index 84% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd rename to python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd index 721a6cabd01..34c054cf36f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd @@ -1,11 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd rename to python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index babb167d2a0..1ea10e8a194 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -1,15 +1,14 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +cimport pylibcudf.libcudf.types as libcudf_types from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector - -cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd similarity index 70% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd rename to python/pylibcudf/pylibcudf/libcudf/quantiles.pxd index 32cfec2d4fc..cf2350fc36c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd @@ -3,12 +3,11 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport ( interpolation, null_order, order, diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd rename to python/pylibcudf/pylibcudf/libcudf/reduce.pxd index 3ae1f1a2906..6d2f4bd23d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd @@ -3,15 +3,11 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility 
cimport pair - -from cudf._lib.pylibcudf.libcudf.aggregation cimport ( - reduce_aggregation, - scan_aggregation, -) -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport data_type +from pylibcudf.libcudf.aggregation cimport reduce_aggregation, scan_aggregation +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pyx b/python/pylibcudf/pylibcudf/libcudf/reduce.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pyx rename to python/pylibcudf/pylibcudf/libcudf/reduce.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/replace.pxd similarity index 83% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd rename to python/pylibcudf/pylibcudf/libcudf/replace.pxd index e67efbdaba0..4ac44fc946e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/replace.pxd @@ -2,15 +2,12 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr - -from cudf._lib.types import cudf_to_np_types, np_to_cudf_types - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/replace.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pyx b/python/pylibcudf/pylibcudf/libcudf/replace.pyx similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pyx rename to python/pylibcudf/pylibcudf/libcudf/replace.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd similarity index 57% rename from python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd rename to python/pylibcudf/pylibcudf/libcudf/reshape.pxd index dfd9a71c3d3..446a082ab1b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd @@ -1,11 +1,10 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
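
One more recurring cleanup, visible in the interop.pxd and replace.pxd hunks above and again in rolling.pxd, sorting.pxd, and stream_compaction.pxd below: the Python-level line

    from cudf._lib.types import cudf_to_np_types, np_to_cudf_types

is deleted outright rather than renamed. Nothing in the remaining declarations in those files references these mappings, so the rewrite treats the import as dead weight in a declaration-only .pxd and drops it. Code that actually needs such dtype tables would import them in its .pyx implementation instead -- a sketch, assuming the mappings remain available in cudf._lib.types:

    # hypothetical .pyx fragment
    from pylibcudf.libcudf.types cimport data_type    # compile-time declaration
    from cudf._lib.types import np_to_cudf_types      # runtime lookup table
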
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd
similarity index 64%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd
rename to python/pylibcudf/pylibcudf/libcudf/rolling.pxd
index d7844f99a73..9e76faa0eba 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd
@@ -1,13 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rolling_aggregation
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.aggregation cimport rolling_aggregation
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd b/python/pylibcudf/pylibcudf/libcudf/round.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
rename to python/pylibcudf/pylibcudf/libcudf/round.pxd
index 027c4634c9f..1b65133f275 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/round.pxd
@@ -2,9 +2,8 @@

 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view


 cdef extern from "cudf/round.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx b/python/pylibcudf/pylibcudf/libcudf/round.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx
rename to python/pylibcudf/pylibcudf/libcudf/round.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/scalar/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.py b/python/pylibcudf/pylibcudf/libcudf/scalar/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/scalar/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd
rename to python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd
index 662eb90096e..4b40a8a26f6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd
@@ -3,11 +3,10 @@
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
-from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport scale_type
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.wrappers.decimals cimport scale_type


 cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
rename to python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd
index 8092c3d637d..ee4b47935b2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd
@@ -2,9 +2,8 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar


 cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd b/python/pylibcudf/pylibcudf/libcudf/search.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd
rename to python/pylibcudf/pylibcudf/libcudf/search.pxd
index e2247a1366f..5a6ad5384c9 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/search.pxd
@@ -1,12 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

+cimport pylibcudf.libcudf.types as libcudf_types
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table_view cimport table_view


 cdef extern from "cudf/search.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd
rename to python/pylibcudf/pylibcudf/libcudf/sorting.pxd
index 3d7d3aa2790..9e899855486 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd
@@ -1,17 +1,14 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

+cimport pylibcudf.libcudf.types as libcudf_types
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.aggregation cimport rank_method
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view


 cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd
rename to python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd
index 11d803e5b76..7830c9478c2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd
@@ -3,14 +3,11 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport (
     nan_equality,
     nan_policy,
     null_equality,
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pyx
rename to python/pylibcudf/pylibcudf/libcudf/stream_compaction.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
rename to python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/libcudf/strings/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/strings/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd
index c4d52c83663..5e510339834 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view


 cdef extern from "cudf/strings/attributes.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd
index b0771e16680..77e3f46d7ee 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd
@@ -1,12 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
-    string_character_types,
-)
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.char_types cimport string_character_types

 cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd
similarity index 81%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/case.pxd
index 82c146b0023..7869e90f387 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view

 cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
similarity index 82%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
index f63e1a93f91..5d54c1c3593 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
@@ -2,10 +2,9 @@

 from libc.stdint cimport uint32_t
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar


 cdef extern from "cudf/strings/char_types/char_types.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx
rename to python/pylibcudf/pylibcudf/libcudf/strings/char_types.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
index b05e46af0d6..e4c9fa5817a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.table.table_view cimport table_view


 cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
index f8ed253ff3c..c2fb5f0dce4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_program cimport regex_program


 cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
index daac2b5be28..83a9573baad 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar

 cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
index 263cee4fe1e..fa8975c4df9 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
@@ -2,10 +2,9 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type


 cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd
similarity index 72%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd
index af357b9bde4..ebe10574353 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd
@@ -2,10 +2,9 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type


 cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
index 91c1abdb5e4..6f820f3c9a4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type


 cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
similarity index 71%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
index 5fbf2be0244..f4fc4674506 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type


 cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd
similarity index 80%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd
index 3d6c59cbfcf..f12aab0a2e4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type


 cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
index 86de956b6b6..fe571cfced6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view


 cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd
similarity index 62%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd
index aba2dbcca64..109111568d8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar

 cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
similarity index 72%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
index fb7e0cae6de..5c07b698454 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view


 cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
new file mode 100644
index 00000000000..12cd628fc1f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.table.table cimport table
+
+
+cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
+
+    cdef unique_ptr[table] extract(
+        column_view source_strings,
+        regex_program) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/find.pxd
index 04e2ed554ee..1d1df1b8b8e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd
@@ -2,11 +2,10 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
similarity index 68%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
index 1f1adc8e99f..0491644a10a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view


 cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
similarity index 56%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
index 4bc450b8911..b25724586e1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.regex_program cimport regex_program


 cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd
similarity index 79%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/json.pxd
index 5926fa1d29f..571ba7be7af 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd
@@ -3,10 +3,9 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar


 cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd
similarity index 59%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd
index 26681a1aa00..657fe61eb14 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd
@@ -2,12 +2,11 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.side_type cimport side_type
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx
rename to python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd
index e92c8bd7737..5d1d9e583d5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd
@@ -2,8 +2,7 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags


 cdef extern from "cudf/strings/regex/regex_program.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
index 9e128529406..410ff58f299 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd
index 34e03eec638..fd5f4fc4751 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd
@@ -3,11 +3,10 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
index 739505cd51d..40f0e2fa50c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
@@ -3,13 +3,12 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/side_type.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
index 5119124b3e3..4162e886a7d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
@@ -2,11 +2,10 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.table.table cimport table


 cdef extern from "cudf/strings/split/partition.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
similarity index 78%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
index 4f75664e47a..3046149aebb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
@@ -2,13 +2,12 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/split/split.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd
similarity index 52%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd
index 2d6fd6a9e89..b0ca771762d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.side_type cimport side_type


 cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd
similarity index 66%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd
index 02123cc0807..576dae9387f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/slice.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
index b23ac277216..85fa719128a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
@@ -4,11 +4,10 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport char_utf8
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport char_utf8


 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd
similarity index 58%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd
index 1d92d445634..c0053391328 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd
index 804ad30dfb1..0c8fe1060ac 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd
@@ -4,13 +4,12 @@
 from libc.stdint cimport uint8_t, uint16_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type

 from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer

-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-
 cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \
     "cudf::strings::udf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/table/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/table/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.py b/python/pylibcudf/pylibcudf/libcudf/table/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/table/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd
rename to python/pylibcudf/pylibcudf/libcudf/table/table.pxd
index 737a1327d45..654c29b083a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd
@@ -2,13 +2,9 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport (
-    mutable_table_view,
-    table_view,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table_view cimport mutable_table_view, table_view
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd b/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd
rename to python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd
index 00e1a89c025..3af2f6a6c2c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd
@@ -1,12 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type


 cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd
rename to python/pylibcudf/pylibcudf/libcudf/transform.pxd
index b0a978fe5c5..38298a7c1f1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
@@ -4,20 +4,15 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.string cimport string
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.expressions cimport expression
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type

 from rmm._lib.device_buffer cimport device_buffer

-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.expressions cimport expression
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    bitmask_type,
-    data_type,
-    size_type,
-)
-
 cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
     cdef pair[unique_ptr[device_buffer], size_type] bools_to_mask (
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd
rename to python/pylibcudf/pylibcudf/libcudf/transpose.pxd
index 5dcb9c165ad..9c0e3c073b0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd
@@ -2,9 +2,8 @@

 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table_view cimport table_view


 cdef extern from "cudf/transpose.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd b/python/pylibcudf/pylibcudf/libcudf/types.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
rename to python/pylibcudf/pylibcudf/libcudf/types.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pyx b/python/pylibcudf/pylibcudf/libcudf/types.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/types.pyx
rename to python/pylibcudf/pylibcudf/libcudf/types.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
rename to python/pylibcudf/pylibcudf/libcudf/unary.pxd
index 2a1b189af51..887f8c7fca4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
@@ -3,10 +3,9 @@
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type


 cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pyx b/python/pylibcudf/pylibcudf/libcudf/unary.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pyx
rename to python/pylibcudf/pylibcudf/libcudf/unary.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.py b/python/pylibcudf/pylibcudf/libcudf/utilities/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/utilities/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/host_span.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd
index 0cc58af735b..69765e44274 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd
@@ -2,8 +2,7 @@

 from libcpp cimport bool
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.types cimport data_type


 cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd
index 890fca3a662..fbeb6e9db90 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

-from cudf._lib.pylibcudf.libcudf.types cimport type_id
+from pylibcudf.libcudf.types cimport type_id


 cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.py b/python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd
index 09b0c87e4b8..558299501d6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.

 from libc.stdint cimport int32_t, int64_t
-
-from cudf._lib.pylibcudf.libcudf.types cimport int128
+from pylibcudf.libcudf.types cimport int128


 cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/durations.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/durations.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/timestamps.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/timestamps.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/lists.pxd
rename to python/pylibcudf/pylibcudf/lists.pxd
index 17619b489d2..e7d006e6e2e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/pylibcudf/pylibcudf/lists.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type
+from pylibcudf.libcudf.types cimport null_order, size_type

 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx
similarity index 95%
rename from python/cudf/cudf/_lib/pylibcudf/lists.pyx
rename to python/pylibcudf/pylibcudf/lists.pyx
index c944fc35800..947caddc485 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/pylibcudf/pylibcudf/lists.pyx
@@ -4,9 +4,8 @@ from cython.operator cimport dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
     filling as cpp_filling,
@@ -14,34 +13,34 @@ from cudf._lib.pylibcudf.libcudf.lists cimport (
     reverse as cpp_reverse,
     set_operations as cpp_set_operations,
 )
-from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
+from pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
     concatenate_null_policy,
     concatenate_rows as cpp_concatenate_rows,
 )
-from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
+from pylibcudf.libcudf.lists.count_elements cimport (
     count_elements as cpp_count_elements,
 )
-from cudf._lib.pylibcudf.libcudf.lists.extract cimport (
+from pylibcudf.libcudf.lists.extract cimport (
     extract_list_element as cpp_extract_list_element,
 )
-from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
+from pylibcudf.libcudf.lists.sorting cimport (
     sort_lists as cpp_sort_lists,
     stable_sort_lists as cpp_stable_sort_lists,
 )
-from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
+from pylibcudf.libcudf.lists.stream_compaction cimport (
     apply_boolean_mask as cpp_apply_boolean_mask,
     distinct as cpp_distinct,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport (
     nan_equality,
     null_equality,
     null_order,
     order,
     size_type,
 )
-from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType
+from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType

 from .column cimport Column, ListColumnView
 from .scalar cimport Scalar
@@ -131,8 +130,8 @@ cpdef Column contains(Column input, ColumnOrScalar search_key):
     the search_key is contained in the input.

     ``search_key`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.

     For details, see :cpp:func:`contains`.

@@ -192,8 +191,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
     key row within the corresponding list row in the lists column.

     ``search_key`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.

     For details, see :cpp:func:`index_of`.

diff --git a/python/cudf/cudf/_lib/pylibcudf/merge.pxd b/python/pylibcudf/pylibcudf/merge.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/merge.pxd rename to python/pylibcudf/pylibcudf/merge.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx similarity index 83% rename from python/cudf/cudf/_lib/pylibcudf/merge.pyx rename to python/pylibcudf/pylibcudf/merge.pyx index 5aa46c142f6..a7d43c9d158 100644 --- a/python/cudf/cudf/_lib/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -3,11 +3,10 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf cimport merge as cpp_merge -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type +from pylibcudf.libcudf cimport merge as cpp_merge +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport null_order, order, size_type from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/quantiles.pxd similarity index 86% rename from python/cudf/cudf/_lib/pylibcudf/quantiles.pxd rename to python/pylibcudf/pylibcudf/quantiles.pxd index 70ff135ca77..fbc1dfb30a6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd +++ b/python/pylibcudf/pylibcudf/quantiles.pxd @@ -1,7 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted +from pylibcudf.libcudf.types cimport interpolation, sorted from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx similarity index 93% rename from python/cudf/cudf/_lib/pylibcudf/quantiles.pyx rename to python/pylibcudf/pylibcudf/quantiles.pyx index c1f0e30ccd3..b847ade774d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -4,15 +4,14 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.quantiles cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.quantiles cimport ( quantile as cpp_quantile, quantiles as cpp_quantiles, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, sorted +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport null_order, order, sorted from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/reduce.pxd b/python/pylibcudf/pylibcudf/reduce.pxd similarity index 85% rename from python/cudf/cudf/_lib/pylibcudf/reduce.pxd rename to python/pylibcudf/pylibcudf/reduce.pxd index 935efd4acf2..047f08297e4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/reduce.pxd +++ b/python/pylibcudf/pylibcudf/reduce.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.libcudf.reduce cimport scan_type +from pylibcudf.libcudf.reduce cimport scan_type from .aggregation cimport Aggregation from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx similarity index 85% rename from python/cudf/cudf/_lib/pylibcudf/reduce.pyx rename to python/pylibcudf/pylibcudf/reduce.pyx index c272f183007..b0212a5b9c1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -3,23 +3,18 @@ from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair - -from cudf._lib.pylibcudf.libcudf cimport reduce as cpp_reduce -from cudf._lib.pylibcudf.libcudf.aggregation cimport ( - reduce_aggregation, - scan_aggregation, -) -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.reduce cimport scan_type -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf cimport reduce as cpp_reduce +from pylibcudf.libcudf.aggregation cimport reduce_aggregation, scan_aggregation +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.reduce cimport scan_type +from pylibcudf.libcudf.scalar.scalar cimport scalar from .aggregation cimport Aggregation from .column cimport Column from .scalar cimport Scalar from .types cimport DataType -from cudf._lib.pylibcudf.libcudf.reduce import \ - scan_type as ScanType # no-cython-lint +from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): diff --git a/python/cudf/cudf/_lib/pylibcudf/replace.pxd b/python/pylibcudf/pylibcudf/replace.pxd similarity index 92% rename from python/cudf/cudf/_lib/pylibcudf/replace.pxd rename to python/pylibcudf/pylibcudf/replace.pxd index 40484c728db..cb9fa8bf960 100644 --- a/python/cudf/cudf/_lib/pylibcudf/replace.pxd +++ b/python/pylibcudf/pylibcudf/replace.pxd @@ -1,8 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from libcpp cimport bool - -from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy +from pylibcudf.libcudf.replace cimport replace_policy from .column cimport Column from .scalar cimport Scalar diff --git a/python/cudf/cudf/_lib/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx similarity index 97% rename from python/cudf/cudf/_lib/pylibcudf/replace.pyx rename to python/pylibcudf/pylibcudf/replace.pyx index 6e08e8f64a9..115dee132fd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -6,11 +6,10 @@ from cython.operator import dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf cimport replace as cpp_replace +from pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf cimport replace as cpp_replace -from cudf._lib.pylibcudf.libcudf.column.column cimport column - -from cudf._lib.pylibcudf.libcudf.replace import \ +from pylibcudf.libcudf.replace import \ replace_policy as ReplacePolicy # no-cython-lint from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pxd b/python/pylibcudf/pylibcudf/reshape.pxd similarity index 80% rename from python/cudf/cudf/_lib/pylibcudf/reshape.pxd rename to python/pylibcudf/pylibcudf/reshape.pxd index a7cc45d7a08..c4d3d375f7a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/reshape.pxd +++ b/python/pylibcudf/pylibcudf/reshape.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .scalar cimport Scalar diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx similarity index 86% rename from python/cudf/cudf/_lib/pylibcudf/reshape.pyx rename to python/pylibcudf/pylibcudf/reshape.pyx index b68eba48cd6..a99145be900 100644 --- a/python/cudf/cudf/_lib/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -2,14 +2,13 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.reshape cimport ( +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.reshape cimport ( interleave_columns as cpp_interleave_columns, tile as cpp_tile, ) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/rolling.pxd b/python/pylibcudf/pylibcudf/rolling.pxd similarity index 85% rename from python/cudf/cudf/_lib/pylibcudf/rolling.pxd rename to python/pylibcudf/pylibcudf/rolling.pxd index cdadee68d43..9fcda21a62f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/rolling.pxd +++ b/python/pylibcudf/pylibcudf/rolling.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx similarity index 89% rename from python/cudf/cudf/_lib/pylibcudf/rolling.pyx rename to python/pylibcudf/pylibcudf/rolling.pyx index 7aa7828a5dd..a46540d7ffa 100644 --- a/python/cudf/cudf/_lib/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -3,11 +3,10 @@ from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf cimport rolling as cpp_rolling -from cudf._lib.pylibcudf.libcudf.aggregation cimport rolling_aggregation -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf cimport rolling as cpp_rolling +from pylibcudf.libcudf.aggregation cimport rolling_aggregation +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pxd b/python/pylibcudf/pylibcudf/round.pxd similarity index 77% rename from python/cudf/cudf/_lib/pylibcudf/round.pxd rename to python/pylibcudf/pylibcudf/round.pxd index ccb64fc2847..c8501b03fad 100644 --- a/python/cudf/cudf/_lib/pylibcudf/round.pxd +++ b/python/pylibcudf/pylibcudf/round.pxd @@ -1,7 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t - -from cudf._lib.pylibcudf.libcudf.round cimport rounding_method +from pylibcudf.libcudf.round cimport rounding_method from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx similarity index 85% rename from python/cudf/cudf/_lib/pylibcudf/round.pyx rename to python/pylibcudf/pylibcudf/round.pyx index cfcc2aafbb8..dc60d53b07e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -2,16 +2,12 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf.round cimport round as cpp_round, rounding_method -from cudf._lib.pylibcudf.libcudf.round cimport ( - round as cpp_round, - rounding_method, -) - -from cudf._lib.pylibcudf.libcudf.round import \ +from pylibcudf.libcudf.round import \ rounding_method as RoundingMethod # no-cython-lint -from cudf._lib.pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column cimport column from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd similarity index 92% rename from python/cudf/cudf/_lib/pylibcudf/scalar.pxd rename to python/pylibcudf/pylibcudf/scalar.pxd index e6c9db2f1ac..8664dfa4b7e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -2,11 +2,10 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.scalar.scalar cimport scalar from rmm._lib.memory_resource cimport DeviceMemoryResource -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar - from .column cimport Column from .types cimport DataType diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx similarity index 94% rename from python/cudf/cudf/_lib/pylibcudf/scalar.pyx 
rename to python/pylibcudf/pylibcudf/scalar.pyx index 67730be07d8..3e20938af0c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -3,14 +3,11 @@ from cython cimport no_gc_clear from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like from rmm._lib.memory_resource cimport get_current_device_resource -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( - make_empty_scalar_like, -) - from .column cimport Column from .types cimport DataType diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pxd b/python/pylibcudf/pylibcudf/search.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/search.pxd rename to python/pylibcudf/pylibcudf/search.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx similarity index 93% rename from python/cudf/cudf/_lib/pylibcudf/search.pyx rename to python/pylibcudf/pylibcudf/search.pyx index 151a39f204f..ff2468f3f9c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -3,10 +3,9 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf cimport search as cpp_search -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order +from pylibcudf.libcudf cimport search as cpp_search +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/pylibcudf/pylibcudf/sorting.pxd similarity index 87% rename from python/cudf/cudf/_lib/pylibcudf/sorting.pxd rename to python/pylibcudf/pylibcudf/sorting.pxd index a4ea541a03b..8127ab21ad1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/sorting.pxd @@ -1,14 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp cimport bool - -from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.pylibcudf.libcudf.types cimport ( - null_order, - null_policy, - order, - size_type, -) +from pylibcudf.libcudf.aggregation cimport rank_method +from pylibcudf.libcudf.types cimport null_order, null_policy, order, size_type from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx similarity index 96% rename from python/cudf/cudf/_lib/pylibcudf/sorting.pyx rename to python/pylibcudf/pylibcudf/sorting.pyx index 8c5a8e26899..bd173eebacb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -3,12 +3,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf cimport sorting as cpp_sorting -from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport null_order, null_policy, order +from pylibcudf.libcudf cimport sorting as cpp_sorting +from pylibcudf.libcudf.aggregation cimport rank_method +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport null_order, null_policy, order from .column cimport Column from .table cimport Table diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd similarity index 89% rename from python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd rename to python/pylibcudf/pylibcudf/stream_compaction.pxd index 6f89aaf90e7..a4f39792f0c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -1,9 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.libcudf.stream_compaction cimport ( - duplicate_keep_option, -) -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option +from pylibcudf.libcudf.types cimport ( nan_equality, nan_policy, null_equality, diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx similarity index 95% rename from python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx rename to python/pylibcudf/pylibcudf/stream_compaction.pyx index 43449d3690a..b574bfa9fa2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -3,16 +3,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf cimport ( - stream_compaction as cpp_stream_compaction, -) -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.stream_compaction cimport ( - duplicate_keep_option, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf cimport stream_compaction as cpp_stream_compaction +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport ( nan_equality, nan_policy, null_equality, @@ -20,7 +15,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( size_type, ) -from cudf._lib.pylibcudf.libcudf.stream_compaction import \ +from pylibcudf.libcudf.stream_compaction import \ duplicate_keep_option as DuplicateKeepOption # no-cython-lint, isort:skip from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt rename to python/pylibcudf/pylibcudf/strings/CMakeLists.txt diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd rename to python/pylibcudf/pylibcudf/strings/__init__.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/strings/__init__.py rename to python/pylibcudf/pylibcudf/strings/__init__.py diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd b/python/pylibcudf/pylibcudf/strings/capitalize.pxd similarity index 64% rename from python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd rename to python/pylibcudf/pylibcudf/strings/capitalize.pxd index 9acf189fc23..b45949d4eb4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.scalar cimport Scalar +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar cpdef Column capitalize(Column input, Scalar delimiters=*) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx similarity index 84% rename from python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx rename to python/pylibcudf/pylibcudf/strings/capitalize.pyx index ccf84d25572..06b991c3cf1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -2,16 +2,15 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( make_string_scalar as cpp_make_string_scalar, ) -from cudf._lib.pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize -from cudf._lib.pylibcudf.scalar cimport Scalar -from cudf._lib.pylibcudf.strings.char_types cimport string_character_types +from pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.char_types cimport string_character_types from cython.operator import dereference diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/pylibcudf/pylibcudf/strings/case.pxd similarity index 76% rename from python/cudf/cudf/_lib/pylibcudf/strings/case.pxd rename to python/pylibcudf/pylibcudf/strings/case.pxd index 225d566fe06..d3c98d5e3dc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd +++ b/python/pylibcudf/pylibcudf/strings/case.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.column cimport Column +from pylibcudf.column cimport Column cpdef Column to_lower(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx similarity index 79% rename from python/cudf/cudf/_lib/pylibcudf/strings/case.pyx rename to python/pylibcudf/pylibcudf/strings/case.pyx index 3a360fd6b10..9e6cd7717d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx +++ b/python/pylibcudf/pylibcudf/strings/case.pyx @@ -2,10 +2,9 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.strings cimport case as cpp_case +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport case as cpp_case cpdef Column to_lower(Column input): diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd new file mode 100644 index 00000000000..ad4e4cf61d8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.libcudf.strings.char_types cimport string_character_types diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx similarity index 64% rename from python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx rename to python/pylibcudf/pylibcudf/strings/char_types.pyx index d96161951c6..e7621fb4d84 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.libcudf.strings.char_types import \ +from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd new file mode 100644 index 00000000000..2cd4891a0ea --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/contains.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re(Column input, RegexProgram prog) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx similarity index 75% rename from python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx rename to python/pylibcudf/pylibcudf/strings/contains.pyx index 8c598b7c953..1a2446f6e2c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -1,11 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains -from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport contains as cpp_contains +from pylibcudf.strings.regex_program cimport RegexProgram cpdef Column contains_re( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/pylibcudf/pylibcudf/strings/find.pxd similarity index 77% rename from python/cudf/cudf/_lib/pylibcudf/strings/find.pxd rename to python/pylibcudf/pylibcudf/strings/find.pxd index bb43069f190..e7524a9360b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd +++ b/python/pylibcudf/pylibcudf/strings/find.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.scalar cimport Scalar +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar ctypedef fused ColumnOrScalar: Column diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx similarity index 90% rename from python/cudf/cudf/_lib/pylibcudf/strings/find.pyx rename to python/pylibcudf/pylibcudf/strings/find.pyx index a0214efd0a1..22d370bf7e8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -1,15 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.strings cimport find as cpp_find -from cudf._lib.pylibcudf.scalar cimport Scalar +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find as cpp_find +from pylibcudf.scalar cimport Scalar from cython.operator import dereference -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar cimport string_scalar cpdef Column find( @@ -22,8 +21,8 @@ cpdef Column find( first found in each string of the provided column. ``target`` may be a - :py:class:`~cudf._lib.pylibcudf.column.Column` or a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + :py:class:`~pylibcudf.column.Column` or a + :py:class:`~pylibcudf.scalar.Scalar`. If ``target`` is a scalar, the scalar will be searched for in each string. If ``target`` is a column, the corresponding string in the column will be @@ -126,8 +125,8 @@ cpdef Column contains( column. ``target`` may be a - :py:class:`~cudf._lib.pylibcudf.column.Column` or a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + :py:class:`~pylibcudf.column.Column` or a + :py:class:`~pylibcudf.scalar.Scalar`. If ``target`` is a scalar, the scalar will be searched for in each string. If ``target`` is a column, the corresponding string in the column will be @@ -180,8 +179,8 @@ cpdef Column starts_with( column. ``target`` may be a - :py:class:`~cudf._lib.pylibcudf.column.Column` or a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + :py:class:`~pylibcudf.column.Column` or a + :py:class:`~pylibcudf.scalar.Scalar`. If ``target`` is a scalar, the scalar will be searched for in each string. If ``target`` is a column, the corresponding string in the column will be @@ -233,8 +232,8 @@ cpdef Column ends_with( target string was found at the end of the string in the provided column. ``target`` may be a - :py:class:`~cudf._lib.pylibcudf.column.Column` or a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + :py:class:`~pylibcudf.column.Column` or a + :py:class:`~pylibcudf.scalar.Scalar`. If ``target`` is a scalar, the scalar will be searched for in each string. If ``target`` is a column, the corresponding string in the column will be diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pxd b/python/pylibcudf/pylibcudf/strings/regex_flags.pxd new file mode 100644 index 00000000000..1ce3cd07df8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx similarity index 59% rename from python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx rename to python/pylibcudf/pylibcudf/strings/regex_flags.pyx index 903c2ddd503..ce3b6b10a42 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \ +from pylibcudf.libcudf.strings.regex_flags import \ regex_flags as RegexFlags # no-cython-lint diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/pylibcudf/pylibcudf/strings/regex_program.pxd similarity index 70% rename from python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd rename to python/pylibcudf/pylibcudf/strings/regex_program.pxd index 61ed268fb2d..045cc1e1c6b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pxd @@ -2,8 +2,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string - -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program +from pylibcudf.libcudf.strings.regex_program cimport regex_program cdef class RegexProgram: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx similarity index 84% rename from python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx rename to python/pylibcudf/pylibcudf/strings/regex_program.pyx index 5f0b8868452..f426b6888ae 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx @@ -4,12 +4,12 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from pylibcudf.libcudf.strings.regex_program cimport regex_program -from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program +from pylibcudf.strings.regex_flags import RegexFlags -from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags -from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_flags cimport regex_flags cdef class RegexProgram: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/pylibcudf/pylibcudf/strings/replace.pxd similarity index 71% rename from python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd rename to python/pylibcudf/pylibcudf/strings/replace.pxd index 52e2dc3c738..26273b96c57 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd +++ b/python/pylibcudf/pylibcudf/strings/replace.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.scalar cimport Scalar +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar cpdef Column replace( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx similarity index 90% rename from python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx rename to python/pylibcudf/pylibcudf/strings/replace.pyx index c757150a600..9d0ebf4a814 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -2,20 +2,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( make_string_scalar as cpp_make_string_scalar, ) -from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( +from pylibcudf.libcudf.strings.replace cimport ( replace as cpp_replace, replace_multiple as cpp_replace_multiple, replace_slice as cpp_replace_slice, ) -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.scalar cimport Scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar cpdef Column replace( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/pylibcudf/pylibcudf/strings/slice.pxd similarity index 69% rename from python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd rename to python/pylibcudf/pylibcudf/strings/slice.pxd index 7d8d0006ef4..01e9f2b3c88 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd +++ b/python/pylibcudf/pylibcudf/strings/slice.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.scalar cimport Scalar +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar ctypedef fused ColumnOrScalar: Column diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx similarity index 81% rename from python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx rename to python/pylibcudf/pylibcudf/strings/slice.pyx index df75134fb71..70d10cab36c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -2,16 +2,15 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( make_fixed_width_scalar as cpp_make_fixed_width_scalar, ) -from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.scalar cimport Scalar +from pylibcudf.libcudf.strings cimport substring as cpp_slice +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar from cython.operator import dereference @@ -25,9 +24,9 @@ cpdef Column slice_strings( """Perform a slice operation on a strings column. ``start`` and ``stop`` may be a - :py:class:`~cudf._lib.pylibcudf.column.Column` or a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + :py:class:`~pylibcudf.column.Column` or a + :py:class:`~pylibcudf.scalar.Scalar`. But ``step`` must be a + :py:class:`~pylibcudf.scalar.Scalar`. For details, see :cpp:func:`cudf::strings::slice_strings`. diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/pylibcudf/pylibcudf/table.pxd similarity index 78% rename from python/cudf/cudf/_lib/pylibcudf/table.pxd rename to python/pylibcudf/pylibcudf/table.pxd index e476fc770e3..cf5c0aa80f2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/pylibcudf/pylibcudf/table.pxd @@ -1,9 +1,8 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr - -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view cdef class Table: diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx similarity index 93% rename from python/cudf/cudf/_lib/pylibcudf/table.pyx rename to python/pylibcudf/pylibcudf/table.pyx index d91fa0474b0..5f77b89a605 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -4,10 +4,9 @@ from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.table.table cimport table from .column cimport Column diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py similarity index 97% rename from python/cudf/cudf/pylibcudf_tests/common/utils.py rename to python/pylibcudf/pylibcudf/tests/common/utils.py index acb2b5be85c..babe6634318 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -6,11 +6,11 @@ import numpy as np import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from pyarrow.parquet import write_table as pq_write_table - -from cudf._lib import pylibcudf as plc -from cudf._lib.pylibcudf.io.types import CompressionType +from pylibcudf.io.types import CompressionType def metadata_from_arrow_type( @@ -157,13 +157,13 @@ def _flatten_arrays(arr): for lh_arr, rh_arr in zip(lhs, rhs): # Check NaNs positions match # and then filter out nans - lhs_nans = pa.compute.is_nan(lh_arr) - rhs_nans = pa.compute.is_nan(rh_arr) + lhs_nans = pc.is_nan(lh_arr) + rhs_nans = pc.is_nan(rh_arr) assert lhs_nans.equals(rhs_nans) - if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): + if pc.any(lhs_nans) or pc.any(rhs_nans): # masks must be equal at this point - mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) + mask = pc.fill_null(pc.invert(lhs_nans), True) lh_arr = lh_arr.filter(mask) rh_arr = rh_arr.filter(mask) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py similarity index 98% rename from python/cudf/cudf/pylibcudf_tests/conftest.py rename to python/pylibcudf/pylibcudf/tests/conftest.py index 945e1689229..fdce6f353ca 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -8,10 +8,9 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc import pytest - -import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.types import CompressionType +from pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py similarity index 98% rename from python/cudf/cudf/pylibcudf_tests/io/test_avro.py rename to python/pylibcudf/pylibcudf/tests/io/test_avro.py index 
061d6792ce3..0cd5064a697 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -5,11 +5,10 @@ import fastavro import pyarrow as pa +import pylibcudf as plc import pytest from utils import assert_table_and_meta_eq -import cudf._lib.pylibcudf as plc - avro_dtype_pairs = [ ("boolean", pa.bool_()), ("int", pa.int32()), diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py similarity index 98% rename from python/cudf/cudf/pylibcudf_tests/io/test_csv.py rename to python/pylibcudf/pylibcudf/tests/io/test_csv.py index 95326a8b681..ccd7eef54f3 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -5,7 +5,9 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc import pytest +from pylibcudf.io.types import CompressionType from utils import ( _convert_numeric_types_to_floating, assert_table_and_meta_eq, @@ -13,9 +15,6 @@ write_source_str, ) -import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.types import CompressionType - # Shared kwargs to pass to make_source _COMMON_CSV_SOURCE_KWARGS = { "format": "csv", diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py similarity index 99% rename from python/cudf/cudf/pylibcudf_tests/io/test_json.py rename to python/pylibcudf/pylibcudf/tests/io/test_json.py index 4239f2438bb..9d976fedf00 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -3,7 +3,9 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc import pytest +from pylibcudf.io.types import CompressionType from utils import ( assert_table_and_meta_eq, make_source, @@ -11,9 +13,6 @@ write_source_str, ) -import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.types import CompressionType - # Shared kwargs to pass to make_source _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py similarity index 97% rename from python/cudf/cudf/pylibcudf_tests/io/test_parquet.py rename to python/pylibcudf/pylibcudf/tests/io/test_parquet.py index dbd20cd473e..f6e843ccf66 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,18 +1,17 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa import pyarrow.compute as pc +import pylibcudf as plc import pytest from pyarrow.parquet import read_table -from utils import assert_table_and_meta_eq, make_source - -import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.expressions import ( +from pylibcudf.expressions import ( ASTOperator, ColumnNameReference, ColumnReference, Literal, Operation, ) +from utils import assert_table_and_meta_eq, make_source # Shared kwargs to pass to make_source _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py similarity index 98% rename from python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py rename to python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 438c482b77a..747f58ec8cf 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,10 +2,9 @@ import io +import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) def io_class(request): diff --git a/python/cudf/cudf/pylibcudf_tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini similarity index 100% rename from python/cudf/cudf/pylibcudf_tests/pytest.ini rename to python/pylibcudf/pylibcudf/tests/pytest.ini diff --git a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py similarity index 99% rename from python/cudf/cudf/pylibcudf_tests/test_binaryops.py rename to python/pylibcudf/pylibcudf/tests/test_binaryops.py index a83caf39ead..f784cb3c191 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -4,11 +4,10 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc import pytest from utils import assert_column_eq -from cudf._lib import pylibcudf as plc - def idfn(param): ltype, rtype, outtype, plc_op, _ = param diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py similarity index 99% rename from python/cudf/cudf/pylibcudf_tests/test_column_factories.py rename to python/pylibcudf/pylibcudf/tests/test_column_factories.py index 4c05770a41f..8cedbc6d42f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,11 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq -from cudf._lib import pylibcudf as plc - EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py similarity index 97% rename from python/cudf/cudf/pylibcudf_tests/test_column_from_device.py rename to python/pylibcudf/pylibcudf/tests/test_column_from_device.py index 78ee2cb100e..0e129fdf0ef 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py @@ -1,13 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa +import pylibcudf as plc import pytest from utils import assert_column_eq import rmm -from cudf._lib import pylibcudf as plc - VALID_TYPES = [ pa.int8(), pa.int16(), diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py similarity index 99% rename from python/cudf/cudf/pylibcudf_tests/test_copying.py rename to python/pylibcudf/pylibcudf/tests/test_copying.py index f27fe4e942e..628682d0a66 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -15,8 +16,6 @@ metadata_from_arrow_type, ) -from cudf._lib import pylibcudf as plc - # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/cudf/cudf/pylibcudf_tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py similarity index 83% rename from python/cudf/cudf/pylibcudf_tests/test_datetime.py rename to python/pylibcudf/pylibcudf/tests/test_datetime.py index 75af0fa6ca1..d3aa6101e2d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -3,11 +3,11 @@ import datetime import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture def column(has_nulls): @@ -25,6 +25,6 @@ def column(has_nulls): def test_extract_year(column): got = plc.datetime.extract_year(column) # libcudf produces an int16, arrow produces an int64 - expect = pa.compute.year(plc.interop.to_arrow(column)).cast(pa.int16()) + expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py similarity index 97% rename from python/cudf/cudf/pylibcudf_tests/test_expressions.py rename to python/pylibcudf/pylibcudf/tests/test_expressions.py index f661512caad..5894ef4624c 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_expressions.py +++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py @@ -1,9 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - # We can't really evaluate these expressions, so just make sure # construction works properly diff --git a/python/cudf/cudf/pylibcudf_tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py similarity index 98% rename from python/cudf/cudf/pylibcudf_tests/test_interop.py rename to python/pylibcudf/pylibcudf/tests/test_interop.py index 5c05f460e28..01c998f16d4 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_interop.py +++ b/python/pylibcudf/pylibcudf/tests/test_interop.py @@ -1,10 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa +import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - def test_list_dtype_roundtrip(): list_type = pa.list_(pa.int32()) diff --git a/python/cudf/cudf/pylibcudf_tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py similarity index 94% rename from python/cudf/cudf/pylibcudf_tests/test_join.py rename to python/pylibcudf/pylibcudf/tests/test_join.py index eb25ed915b1..61e02f4d28d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_join.py +++ b/python/pylibcudf/pylibcudf/tests/test_join.py @@ -2,10 +2,9 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc from utils import assert_table_eq -from cudf._lib import pylibcudf as plc - def test_cross_join(): left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py similarity index 99% rename from python/cudf/cudf/pylibcudf_tests/test_lists.py rename to python/pylibcudf/pylibcudf/tests/test_lists.py index 33f95a7d364..2353a6ff8f9 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -2,11 +2,11 @@ import numpy as np import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -from cudf._lib import pylibcudf as plc - @pytest.fixture def test_data(): @@ -184,7 +184,7 @@ def test_extract_list_element_scalar(list_column): plc_column = plc.interop.from_arrow(pa.array(list_column)) res = plc.lists.extract_list_element(plc_column, 0) - expect = pa.compute.list_element(list_column, 0) + expect = pc.list_element(list_column, 0) assert_column_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py similarity index 99% rename from python/cudf/cudf/pylibcudf_tests/test_quantiles.py rename to python/pylibcudf/pylibcudf/tests/test_quantiles.py index 13f3b037606..bac56691306 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py +++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py @@ -3,11 +3,10 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq -import cudf._lib.pylibcudf as plc - # Map pylibcudf interpolation options to pyarrow options interp_mapping = { plc.types.Interpolation.LINEAR: "linear", diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py similarity index 89% rename from python/cudf/cudf/pylibcudf_tests/test_regex_program.py rename to python/pylibcudf/pylibcudf/tests/test_regex_program.py index 3a9bcec3616..777315df538 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py +++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py @@ -1,9 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - @pytest.mark.parametrize("pat", ["(", "*", "\\"]) def test_regex_program_invalid(pat): diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py similarity index 96% rename from python/cudf/cudf/pylibcudf_tests/test_reshape.py rename to python/pylibcudf/pylibcudf/tests/test_reshape.py index da1157e5832..01115bc363a 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,11 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq -from cudf._lib import pylibcudf as plc - @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py similarity index 86% rename from python/cudf/cudf/pylibcudf_tests/test_round.py rename to python/pylibcudf/pylibcudf/tests/test_round.py index 991e6ed310d..0b30316b9a0 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_round.py +++ b/python/pylibcudf/pylibcudf/tests/test_round.py @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture(params=["float32", "float64"]) def column(request, has_nulls): @@ -26,8 +26,6 @@ def test_round(column, round_mode, decimals): "half_to_even": plc.round.RoundingMethod.HALF_EVEN, }[round_mode] got = plc.round.round(column, decimals, method) - expect = pa.compute.round( - plc.interop.to_arrow(column), decimals, round_mode - ) + expect = pc.round(plc.interop.to_arrow(column), decimals, round_mode) assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py similarity index 86% rename from python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py rename to python/pylibcudf/pylibcudf/tests/test_string_capitalize.py index c4e437fe5d9..176ccc55b96 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture(scope="module") def str_data(): @@ -34,7 +34,7 @@ def str_data(): def test_capitalize(str_data): pa_data, plc_data = str_data got = plc.strings.capitalize.capitalize(plc_data) - expected = pa.compute.utf8_capitalize(pa_data) + expected = pc.utf8_capitalize(pa_data) assert_column_eq(expected, got) @@ -43,12 +43,12 @@ def test_title(str_data): got = plc.strings.capitalize.title( plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) - expected = pa.compute.utf8_title(pa_data) + expected = pc.utf8_title(pa_data) assert_column_eq(expected, got) def test_is_title(str_data): pa_data, plc_data = str_data got = plc.strings.capitalize.is_title(plc_data) - expected = pa.compute.utf8_is_title(pa_data) + expected = pc.utf8_is_title(pa_data) assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py similarity index 80% rename from python/cudf/cudf/pylibcudf_tests/test_string_case.py rename to python/pylibcudf/pylibcudf/tests/test_string_case.py index 1039859b2cf..233cc253b14 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_case.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture(scope="module") def string_col(): @@ -17,19 +17,19 @@ def string_col(): def test_to_upper(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.to_upper(plc_col) - expected = pa.compute.utf8_upper(string_col) + expected = pc.utf8_upper(string_col) assert_column_eq(expected, got) def test_to_lower(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.to_lower(plc_col) - expected = pa.compute.utf8_lower(string_col) + expected = pc.utf8_lower(string_col) assert_column_eq(expected, got) def test_swapcase(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.swapcase(plc_col) - expected = pa.compute.utf8_swapcase(string_col) + expected = pc.utf8_swapcase(string_col) assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py similarity index 92% rename from python/cudf/cudf/pylibcudf_tests/test_string_contains.py rename to python/pylibcudf/pylibcudf/tests/test_string_contains.py index fc8c6656b5d..4f88e09183f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture(scope="module") def target_col(): @@ -44,7 +44,7 @@ def plc_target_pat(pa_target_scalar): def test_contains_re(target_col, pa_target_scalar, plc_target_pat): pa_target_col, plc_target_col = target_col got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) - expected = pa.compute.match_substring_regex( + expected = pc.match_substring_regex( pa_target_col, pa_target_scalar.as_py() ) assert_column_eq(got, expected) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py similarity index 97% rename from python/cudf/cudf/pylibcudf_tests/test_string_find.py rename to python/pylibcudf/pylibcudf/tests/test_string_find.py index 95a1a3cf731..db3b13a5aae 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture(scope="module") def data_col(): @@ -223,7 +223,7 @@ def test_starts_with(data_col, target_scalar): pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar) - expected = pa.compute.starts_with(pa_data_col, py_target) + expected = pc.starts_with(pa_data_col, py_target) assert_column_eq(expected, got) @@ -242,7 +242,7 @@ def test_ends_with(data_col, target_scalar): pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar) - expected = pa.compute.ends_with(pa_data_col, py_target) + expected = pc.ends_with(pa_data_col, py_target) assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py similarity index 95% rename from python/cudf/cudf/pylibcudf_tests/test_string_replace.py rename to python/pylibcudf/pylibcudf/tests/test_string_replace.py index f20edf6a506..5a9c2007b73 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture(scope="module") def data_col(): @@ -64,7 +64,7 @@ def test_replace(data_col, scalar_repl_target, scalar_repl, maxrepl): plc_data_col, plc_target, plc_repl, maxrepl ) - expected = pa.compute.replace_substring( + expected = pc.replace_substring( pa_data_col, pattern=pa_target, replacement=pa_repl, @@ -90,7 +90,7 @@ def test_replace_slice(data_col, scalar_repl, startstop): # count_characters on the input, take the max and set stop to that stop = 1000 - expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl) + expected = pc.utf8_replace_slice(pa_data_col, start, stop, pa_repl) assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py similarity index 98% rename from python/cudf/cudf/pylibcudf_tests/test_string_slice.py rename to python/pylibcudf/pylibcudf/tests/test_string_slice.py index bd63987b30f..d9ce5591b98 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py @@ -1,11 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pylibcudf as plc import pytest from utils import assert_column_eq -import cudf._lib.pylibcudf as plc - @pytest.fixture(scope="module") def pa_col(): diff --git a/python/cudf/cudf/pylibcudf_tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py similarity index 93% rename from python/cudf/cudf/pylibcudf_tests/test_table.py rename to python/pylibcudf/pylibcudf/tests/test_table.py index cf1d51f6491..e822d6a97a8 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_table.py +++ b/python/pylibcudf/pylibcudf/tests/test_table.py @@ -1,10 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pylibcudf as plc import pytest -import cudf._lib.pylibcudf as plc - @pytest.mark.parametrize( "arrow_tbl", diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/pylibcudf/pylibcudf/tests/test_traits.py similarity index 98% rename from python/cudf/cudf/pylibcudf_tests/test_traits.py rename to python/pylibcudf/pylibcudf/tests/test_traits.py index 6c22cb02f21..2570e8abd51 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_traits.py +++ b/python/pylibcudf/pylibcudf/tests/test_traits.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from cudf._lib import pylibcudf as plc +import pylibcudf as plc def test_is_relationally_comparable(): diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py similarity index 95% rename from python/cudf/cudf/pylibcudf_tests/test_transform.py rename to python/pylibcudf/pylibcudf/tests/test_transform.py index 312939888dd..06fc35d8835 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,10 +3,9 @@ import math import pyarrow as pa +import pylibcudf as plc from utils import assert_column_eq -from cudf._lib import pylibcudf as plc - def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/cudf/cudf/pylibcudf_tests/test_unary.py b/python/pylibcudf/pylibcudf/tests/test_unary.py similarity index 93% rename from python/cudf/cudf/pylibcudf_tests/test_unary.py rename to python/pylibcudf/pylibcudf/tests/test_unary.py index b5e4f0cb0e8..9b8085d5c52 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_unary.py +++ b/python/pylibcudf/pylibcudf/tests/test_unary.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib import pylibcudf as plc +import pylibcudf as plc def test_is_supported_cast(): diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/pylibcudf/pylibcudf/traits.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/traits.pxd rename to python/pylibcudf/pylibcudf/traits.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx similarity index 98% rename from python/cudf/cudf/_lib/pylibcudf/traits.pyx rename to python/pylibcudf/pylibcudf/traits.pyx index d2370f8d641..5a1c67e1f6c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/traits.pyx +++ b/python/pylibcudf/pylibcudf/traits.pyx @@ -1,8 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp cimport bool - -from cudf._lib.pylibcudf.libcudf.utilities cimport traits +from pylibcudf.libcudf.utilities cimport traits from .types cimport DataType diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd similarity index 100% rename from python/cudf/cudf/_lib/pylibcudf/transform.pxd rename to python/pylibcudf/pylibcudf/transform.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx similarity index 87% rename from python/cudf/cudf/_lib/pylibcudf/transform.pyx rename to python/pylibcudf/pylibcudf/transform.pyx index a734e71b820..100ccb580ce 100644 --- a/python/cudf/cudf/_lib/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -2,12 +2,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair +from pylibcudf.libcudf cimport transform as cpp_transform +from pylibcudf.libcudf.types cimport size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer -from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform -from cudf._lib.pylibcudf.libcudf.types cimport size_type - from .column cimport Column from .gpumemoryview cimport gpumemoryview diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/pylibcudf/pylibcudf/types.pxd similarity index 91% rename from python/cudf/cudf/_lib/pylibcudf/types.pxd rename to python/pylibcudf/pylibcudf/types.pxd index 1f3e1aa2fbb..aa48979d961 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/pylibcudf/pylibcudf/types.pxd @@ -2,8 +2,7 @@ from libc.stdint cimport int32_t from libcpp cimport bool as cbool - -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.types cimport ( data_type, interpolation, mask_state, diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx similarity index 66% rename from python/cudf/cudf/_lib/pylibcudf/types.pyx rename to python/pylibcudf/pylibcudf/types.pyx index 311f9ce4046..58c7d97e9bc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -1,25 +1,24 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t - -from cudf._lib.pylibcudf.libcudf.types cimport ( +from pylibcudf.libcudf.types cimport ( data_type, size_of as cpp_size_of, size_type, type_id, ) -from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id - -from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip +from pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id + +from pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import null_policy as NullPolicy # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import interpolation as Interpolation # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import nan_equality as NanEquality # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import null_equality as NullEquality # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip cdef class DataType: diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pxd b/python/pylibcudf/pylibcudf/unary.pxd similarity index 87% rename from python/cudf/cudf/_lib/pylibcudf/unary.pxd rename to python/pylibcudf/pylibcudf/unary.pxd index d07df838172..9ee08653599 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pxd +++ b/python/pylibcudf/pylibcudf/unary.pxd @@ -1,8 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp cimport bool - -from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator +from pylibcudf.libcudf.unary cimport unary_operator from .column cimport Column from .types cimport DataType diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx similarity index 94% rename from python/cudf/cudf/_lib/pylibcudf/unary.pyx rename to python/pylibcudf/pylibcudf/unary.pyx index 8da46f0a832..839360ef406 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -3,12 +3,11 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from pylibcudf.libcudf cimport unary as cpp_unary +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.unary cimport unary_operator -from cudf._lib.pylibcudf.libcudf cimport unary as cpp_unary -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator - -from cudf._lib.pylibcudf.libcudf.unary import \ +from pylibcudf.libcudf.unary import \ unary_operator as UnaryOperator # no-cython-lint from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pxd b/python/pylibcudf/pylibcudf/utils.pxd similarity index 71% rename from python/cudf/cudf/_lib/pylibcudf/utils.pxd rename to python/pylibcudf/pylibcudf/utils.pxd index 77c05086397..6b994f20b61 100644 --- a/python/cudf/cudf/_lib/pylibcudf/utils.pxd +++ b/python/pylibcudf/pylibcudf/utils.pxd @@ -2,9 +2,8 @@ from libcpp.functional cimport reference_wrapper from libcpp.vector cimport vector - -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport bitmask_type cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pyx b/python/pylibcudf/pylibcudf/utils.pyx similarity index 93% rename from python/cudf/cudf/_lib/pylibcudf/utils.pyx rename to python/pylibcudf/pylibcudf/utils.pyx index 42e3575ed44..ee4421ddeaf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/utils.pyx +++ b/python/pylibcudf/pylibcudf/utils.pyx @@ -5,11 +5,10 @@ from cython.operator import dereference from libc.stdint cimport uintptr_t from libcpp.functional cimport reference_wrapper from libcpp.vector cimport vector - from cuda import cudart -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type +from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport bitmask_type from .scalar cimport Scalar diff --git a/python/cudf/cudf/_lib/variant.pxd b/python/pylibcudf/pylibcudf/variant.pxd similarity index 100% rename from python/cudf/cudf/_lib/variant.pxd rename to python/pylibcudf/pylibcudf/variant.pxd diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml new file mode 100644 index 00000000000..b037508d03f --- /dev/null +++ b/python/pylibcudf/pyproject.toml @@ -0,0 +1,123 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. + +[build-system] +build-backend = "rapids_build_backend.build" +requires = [ + "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "scikit-build-core[pyproject]>=0.10.0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+
+[project]
+name = "pylibcudf"
+dynamic = ["version"]
+description = "pylibcudf - Python bindings for libcudf"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.9"
+dependencies = [
+    "cuda-python>=11.7.1,<12.0a0",
+    "nvtx>=0.2.1",
+    "packaging",
+    "pyarrow>=16.1.0,<16.2.0a0",
+    "rmm==24.10.*,>=0.0.0a0",
+    "typing_extensions>=4.0.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+classifiers = [
+    "Intended Audience :: Developers",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+]
+
+[project.optional-dependencies]
+test = [
+    "fastavro>=0.22.9",
+    "hypothesis",
+    "numpy",
+    "pandas",
+    "pytest-cov",
+    "pytest-xdist",
+    "pytest<8",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cudf"
+Documentation = "https://docs.rapids.ai/api/cudf/stable/"
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_dask = [
+    "dask",
+    "distributed",
+    "dask_cuda",
+]
+known_rapids = [
+    "rmm",
+]
+known_first_party = [
+    "cudf",
+]
+default_section = "THIRDPARTY"
+sections = [
+    "FUTURE",
+    "STDLIB",
+    "THIRDPARTY",
+    "DASK",
+    "RAPIDS",
+    "FIRSTPARTY",
+    "LOCALFOLDER",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "cython>=3.0.3",
+    "ninja",
+    "numpy==1.23.*",
+    "pyarrow==16.1.0.*",
+    "rmm==24.10.*,>=0.0.0a0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.exclude = ["*tests*"]
+sdist.reproducible = true
+wheel.packages = ["pylibcudf"]
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "pylibcudf/VERSION"
+regex = "(?P<value>.*)"

From 10cdd5fc5dcfc73404ae825f5d4bcf357c69ff24 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 16 Aug 2024 12:49:28 -0700
Subject: [PATCH 078/270] Reenable arrow tests (#16556)

This PR reenables the tests that were disabled in #16379, converting
them to use the new C data interface functions instead of the old
libarrow-based ones.
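For reference, the core of the change is the `export_table` helper this
patch adds to from_arrow_test.cpp: instead of handing libarrow types to
libcudf, the tests now export the Arrow table through the C data
interface structs and import those. Below is a condensed sketch of that
helper, with the template arguments spelled out and the include set
assumed from the Arrow C++ and libcudf public headers:

#include <arrow/c/bridge.h>
#include <arrow/table.h>
#include <cudf/interop.hpp>

#include <memory>
#include <optional>

std::optional<std::unique_ptr<cudf::table>> export_table(
  std::shared_ptr<arrow::Table> arrow_table)
{
  // Export the schema and a single combined record batch into the
  // C-ABI structs defined by the Arrow C data interface.
  ArrowSchema schema;
  if (!arrow::ExportSchema(*arrow_table->schema(), &schema).ok()) { return std::nullopt; }
  auto batch = arrow_table->CombineChunksToBatch().ValueOrDie();
  ArrowArray arr;
  if (!arrow::ExportRecordBatch(*batch, &arr).ok()) { return std::nullopt; }
  // libcudf copies the data to device memory during import, so the C
  // structs can be released as soon as from_arrow returns.
  auto ret = cudf::from_arrow(&schema, &arr);
  arr.release(&arr);
  schema.release(&schema);
  return {std::move(ret)};
}

The scalar tests follow the same pattern through arrow::MakeArrayFromScalar
and cudf::from_arrow_column, reading the element back with cudf::get_element.
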
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16556 --- cpp/tests/CMakeLists.txt | 4 - cpp/tests/interop/arrow_utils.hpp | 5 +- cpp/tests/interop/from_arrow_test.cpp | 145 +++++++++++++------ cpp/tests/interop/to_arrow_test.cpp | 192 ++++++++++++++++---------- cpp/tests/streams/interop_test.cpp | 78 ----------- 5 files changed, 224 insertions(+), 200 deletions(-) delete mode 100644 cpp/tests/streams/interop_test.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8c4b0f1e367..006b36add0e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -690,10 +690,6 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) -# Deprecation from 16297 and fixes in 16379 caused this test to be empty This will be reenabled once -# the deprecated APIs have been replaced in 24.10. -# -# ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index 1fdf02e02f1..08eada632a5 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -32,6 +32,8 @@ #include +#include + #pragma once template @@ -154,8 +156,9 @@ std::shared_ptr get_arrow_list_array(std::vector data, "Failed to append values to buffer builder"); CUDF_EXPECTS(buff_builder.Finish(&offset_buffer).ok(), "Failed to allocate buffer"); + auto nullable = std::accumulate(list_validity.begin(), list_validity.end(), 0) > 0; return std::make_shared( - arrow::list(data_array->type()), + arrow::list(arrow::field("", data_array->type(), nullable)), offsets.size() - 1, offset_buffer, data_array, diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 733e5814425..81c406c0faf 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -14,13 +14,6 @@ * limitations under the License. */ -// These interop functions are deprecated. We keep the code in this -// test and will migrate the tests to export the arrow C data -// interface which we consume with from_arrow_host. For now, the tests -// are commented out. 
- -#if 0 - #include #include @@ -43,6 +36,10 @@ #include +#include +#include +#include + std::unique_ptr get_cudf_table() { std::vector> columns; @@ -93,6 +90,45 @@ struct FromArrowTest : public cudf::test::BaseFixture {}; template struct FromArrowTestDurationsTest : public cudf::test::BaseFixture {}; +std::optional> export_table(std::shared_ptr arrow_table) +{ + ArrowSchema schema; + if (!arrow::ExportSchema(*arrow_table->schema(), &schema).ok()) { return std::nullopt; } + auto batch = arrow_table->CombineChunksToBatch().ValueOrDie(); + ArrowArray arr; + if (!arrow::ExportRecordBatch(*batch, &arr).ok()) { return std::nullopt; } + auto ret = cudf::from_arrow(&schema, &arr); + arr.release(&arr); + schema.release(&schema); + return {std::move(ret)}; +} + +std::optional> export_scalar(arrow::Scalar const& arrow_scalar) +{ + auto maybe_array = arrow::MakeArrayFromScalar(arrow_scalar, 1); + if (!maybe_array.ok()) { return std::nullopt; } + auto array = *maybe_array; + + ArrowSchema schema; + if (!arrow::ExportType(*array->type(), &schema).ok()) { return std::nullopt; } + + ArrowArray arr; + if (!arrow::ExportArray(*array, &arr).ok()) { return std::nullopt; } + + auto col = cudf::from_arrow_column(&schema, &arr); + auto ret = cudf::get_element(col->view(), 0); + + arr.release(&arr); + schema.release(&schema); + return {std::move(ret)}; +} + +std::optional> export_scalar( + std::shared_ptr const arrow_scalar) +{ + return export_scalar(*arrow_scalar); +} + TYPED_TEST_SUITE(FromArrowTestDurationsTest, cudf::test::DurationTypes); TEST_F(FromArrowTest, EmptyTable) @@ -102,9 +138,10 @@ TEST_F(FromArrowTest, EmptyTable) auto expected_cudf_table = tables.first->view(); auto arrow_table = tables.second; - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table.value()->view()); } TEST_F(FromArrowTest, DateTimeTable) @@ -127,9 +164,10 @@ TEST_F(FromArrowTest, DateTimeTable) auto arrow_table = arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table.value()->view()); } TYPED_TEST(FromArrowTestDurationsTest, DurationTable) @@ -160,9 +198,10 @@ TYPED_TEST(FromArrowTestDurationsTest, DurationTable) auto arrow_table = arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table.value()->view()); } TEST_F(FromArrowTest, NestedList) @@ -188,8 +227,9 @@ TEST_F(FromArrowTest, NestedList) auto arrow_table = arrow::Table::Make(schema, {nested_list_arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table.value()->view()); } TEST_F(FromArrowTest, StructColumn) @@ -274,9 +314,10 @@ TEST_F(FromArrowTest, 
StructColumn) auto schema = std::make_shared(schema_vector); auto input = arrow::Table::Make(schema, {struct_array}); - auto got_cudf_table = cudf::from_arrow(*input); + auto got_cudf_table = export_table(input); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table.value()->view()); } TEST_F(FromArrowTest, DictionaryIndicesType) @@ -304,9 +345,10 @@ TEST_F(FromArrowTest, DictionaryIndicesType) cudf::table expected_table(std::move(columns)); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table.value()->view()); } TEST_F(FromArrowTest, ChunkedArray) @@ -369,9 +411,10 @@ TEST_F(FromArrowTest, ChunkedArray) auto expected_cudf_table = get_cudf_table(); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table->view(), got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table->view(), got_cudf_table.value()->view()); } struct FromArrowTestSlice @@ -388,13 +431,14 @@ TEST_P(FromArrowTestSlice, SliceTest) auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; auto expected_cudf_table = cudf::table{sliced_cudf_table}; auto sliced_arrow_table = arrow_table->Slice(start, end - start); - auto got_cudf_table = cudf::from_arrow(*sliced_arrow_table); + auto got_cudf_table = export_table(sliced_arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); // This has been added to take-care of empty string column issue with no children - if (got_cudf_table->num_rows() == 0 and expected_cudf_table.num_rows() == 0) { - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view()); + if (got_cudf_table.value()->num_rows() == 0 and expected_cudf_table.num_rows() == 0) { + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table.value()->view()); } else { - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table.value()->view()); } } @@ -417,9 +461,10 @@ TEST_F(FromArrowTest, FixedPoint128Table) auto const schema = std::make_shared(schema_vector); auto const arrow_table = arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view()); } } @@ -441,9 +486,10 @@ TEST_F(FromArrowTest, FixedPoint128TableLarge) auto const schema = std::make_shared(schema_vector); auto const arrow_table = arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view()); } } @@ -466,9 +512,10 @@ TEST_F(FromArrowTest, FixedPoint128TableNulls) auto const schema = std::make_shared(schema_vector); auto const arrow_table = 
arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view()); } } @@ -493,9 +540,10 @@ TEST_F(FromArrowTest, FixedPoint128TableNullsLarge) auto const schema = std::make_shared(schema_vector); auto const arrow_table = arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = export_table(arrow_table); + ASSERT_TRUE(got_cudf_table.has_value()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view()); } } @@ -519,9 +567,12 @@ TYPED_TEST(FromArrowNumericScalarTest, Basic) { TypeParam const value{42}; auto const arrow_scalar = arrow::MakeScalar(value); - auto const cudf_scalar = cudf::from_arrow(*arrow_scalar); + + auto const cudf_scalar = export_scalar(arrow_scalar); + ASSERT_TRUE(cudf_scalar.has_value()); + auto const cudf_numeric_scalar = - dynamic_cast*>(cudf_scalar.get()); + dynamic_cast*>(cudf_scalar.value().get()); if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); } EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id())); EXPECT_EQ(cudf_numeric_scalar->value(), value); @@ -535,12 +586,13 @@ TEST_F(FromArrowDecimalScalarTest, Basic) auto const value{42}; auto const precision{8}; auto const scale{4}; - auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); - auto cudf_scalar = cudf::from_arrow(arrow_scalar); + auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); + auto const cudf_scalar = export_scalar(arrow_scalar); + ASSERT_TRUE(cudf_scalar.has_value()); // Arrow offers a minimum of 128 bits for the Decimal type. 
auto const cudf_decimal_scalar = - dynamic_cast*>(cudf_scalar.get()); + dynamic_cast*>(cudf_scalar.value().get()); EXPECT_EQ(cudf_decimal_scalar->type(), cudf::data_type(cudf::type_to_id(), scale)); EXPECT_EQ(cudf_decimal_scalar->value(), value); @@ -552,9 +604,10 @@ TEST_F(FromArrowStringScalarTest, Basic) { auto const value = std::string("hello world"); auto const arrow_scalar = arrow::StringScalar(value); - auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + auto const cudf_scalar = export_scalar(arrow_scalar); + ASSERT_TRUE(cudf_scalar.has_value()); - auto const cudf_string_scalar = dynamic_cast(cudf_scalar.get()); + auto const cudf_string_scalar = dynamic_cast(cudf_scalar.value().get()); EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING)); EXPECT_EQ(cudf_string_scalar->to_string(), value); } @@ -572,9 +625,10 @@ TEST_F(FromArrowListScalarTest, Basic) auto const array = *maybe_array; auto const arrow_scalar = arrow::ListScalar(array); - auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + auto const cudf_scalar = export_scalar(arrow_scalar); + ASSERT_TRUE(cudf_scalar.has_value()); - auto const cudf_list_scalar = dynamic_cast(cudf_scalar.get()); + auto const cudf_list_scalar = dynamic_cast(cudf_scalar.value().get()); EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST)); cudf::test::fixed_width_column_wrapper const lhs( @@ -592,9 +646,10 @@ TEST_F(FromArrowStructScalarTest, Basic) auto const field = arrow::field("", underlying_arrow_scalar->type); auto const arrow_type = arrow::struct_({field}); auto const arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); - auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + auto const cudf_scalar = export_scalar(arrow_scalar); + ASSERT_TRUE(cudf_scalar.has_value()); - auto const cudf_struct_scalar = dynamic_cast(cudf_scalar.get()); + auto const cudf_struct_scalar = dynamic_cast(cudf_scalar.value().get()); EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT)); cudf::test::fixed_width_column_wrapper const col({value}); @@ -602,5 +657,3 @@ TEST_F(FromArrowStructScalarTest, Basic) CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view()); } - -#endif diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 328ba210a3f..90ae12cdd90 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -14,13 +14,6 @@ * limitations under the License. */ -// These interop functions are deprecated. We keep the code in this -// test and will migrate the tests to export via the arrow C data -// interface with to_arrow_host which arrow can consume. For now, the -// test is commented out. 
- -#if 0 - #include #include @@ -38,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +39,8 @@ #include +#include + using vector_of_columns = std::vector>; std::pair, std::shared_ptr> get_tables( @@ -130,7 +126,7 @@ std::pair, std::shared_ptr> get_table auto keys = cudf::test::to_host(view.keys()).first; auto indices = cudf::test::to_host(view.indices()).first; auto dict_array = get_arrow_dict_array(std::vector(keys.begin(), keys.end()), - std::vector(indices.begin(), indices.end()), + std::vector(indices.begin(), indices.end()), validity); auto boolarray = get_arrow_array(bool_data, bool_validity); auto list_array = get_arrow_list_array( @@ -168,6 +164,21 @@ struct ToArrowTest : public cudf::test::BaseFixture {}; template struct ToArrowTestDurationsTest : public cudf::test::BaseFixture {}; +auto is_equal(cudf::table_view const& table, + cudf::host_span metadata, + std::shared_ptr expected_arrow_table) +{ + auto got_arrow_schema = cudf::to_arrow_schema(table, metadata); + auto got_arrow_table = cudf::to_arrow_host(table); + + for (auto i = 0; i < got_arrow_schema->n_children; ++i) { + auto arr = arrow::ImportArray(got_arrow_table->array.children[i], got_arrow_schema->children[i]) + .ValueOrDie(); + if (!expected_arrow_table->column(i)->Equals(arrow::ChunkedArray(arr))) { return false; } + } + return true; +} + TYPED_TEST_SUITE(ToArrowTestDurationsTest, cudf::test::DurationTypes); TEST_F(ToArrowTest, EmptyTable) @@ -179,10 +190,9 @@ TEST_F(ToArrowTest, EmptyTable) auto struct_meta = cudf::column_metadata{"f"}; struct_meta.children_meta = {{"integral"}, {"string"}}; - auto got_arrow_table = - cudf::to_arrow(cudf_table_view, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta}); - - ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true); + std::vector const metadata = { + {"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta}; + ASSERT_TRUE(is_equal(cudf_table_view, metadata, expected_arrow_table)); } TEST_F(ToArrowTest, DateTimeTable) @@ -203,12 +213,10 @@ TEST_F(ToArrowTest, DateTimeTable) std::vector> schema_vector({arrow::field("a", arr->type())}); auto schema = std::make_shared(schema_vector); - auto expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input_view, {{"a"}}); - - ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input_view, metadata, expected_arrow_table)); } TYPED_TEST(ToArrowTestDurationsTest, DurationTable) @@ -239,9 +247,8 @@ TYPED_TEST(ToArrowTestDurationsTest, DurationTable) auto expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input_view, {{"a"}}); - - ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input_view, metadata, expected_arrow_table)); } TEST_F(ToArrowTest, NestedList) @@ -255,20 +262,20 @@ TEST_F(ToArrowTest, NestedList) auto list_arr = get_arrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); std::vector offset{0, 0, 2}; auto mask_buffer = arrow::internal::BytesToBits({0, 1}).ValueOrDie(); - auto nested_list_arr = std::make_shared(arrow::list(list(arrow::int64())), - offset.size() - 1, - arrow::Buffer::Wrap(offset), - list_arr, - mask_buffer); + auto nested_list_arr = std::make_shared( + arrow::list(arrow::field("a", arrow::list(arrow::int64()), false)), + offset.size() - 1, + arrow::Buffer::Wrap(offset), + list_arr, + mask_buffer); std::vector> 
schema_vector( {arrow::field("a", nested_list_arr->type())}); auto schema = std::make_shared(schema_vector); - auto expected_arrow_table = arrow::Table::Make(schema, {nested_list_arr}); - auto got_arrow_table = cudf::to_arrow(input_view, {{"a"}}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + auto expected_arrow_table = arrow::Table::Make(schema, {nested_list_arr}); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input_view, metadata, expected_arrow_table)); } TEST_F(ToArrowTest, StructColumn) @@ -324,7 +331,10 @@ TEST_F(ToArrowTest, StructColumn) auto list_arr = get_arrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); std::vector offset{0, 3, 4, 6}; auto nested_list_arr = std::make_shared( - arrow::list(list(arrow::int64())), offset.size() - 1, arrow::Buffer::Wrap(offset), list_arr); + arrow::list(arrow::field("a", arrow::list(arrow::field("a", arrow::int64(), false)), false)), + offset.size() - 1, + arrow::Buffer::Wrap(offset), + list_arr); std::vector> child_arrays2({str2_array, int2_array}); auto fields2 = std::vector>{ @@ -356,9 +366,8 @@ TEST_F(ToArrowTest, StructColumn) auto expected_arrow_table = arrow::Table::Make(schema, {struct_array}); - auto got_arrow_table = cudf::to_arrow(input_view, {metadata}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + std::vector const meta = {metadata}; + ASSERT_TRUE(is_equal(input_view, meta, expected_arrow_table)); } template @@ -380,9 +389,8 @@ TEST_F(ToArrowTest, FixedPoint64Table) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } @@ -402,9 +410,8 @@ TEST_F(ToArrowTest, FixedPoint128Table) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } @@ -431,9 +438,8 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } @@ -455,9 +461,8 @@ TEST_F(ToArrowTest, FixedPoint128TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } @@ -479,9 +484,8 @@ TEST_F(ToArrowTest, FixedPoint64TableNullsSimple) auto const schema = std::make_shared(schema_vector); auto const arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, arrow_table)); } 
} @@ -503,9 +507,8 @@ TEST_F(ToArrowTest, FixedPoint128TableNullsSimple) auto const schema = std::make_shared(schema_vector); auto const arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, arrow_table)); } } @@ -529,9 +532,8 @@ TEST_F(ToArrowTest, FixedPoint64TableNulls) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } @@ -554,9 +556,8 @@ TEST_F(ToArrowTest, FixedPoint128TableNulls) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - auto const got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + std::vector const metadata = {{"a"}}; + ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } @@ -575,10 +576,10 @@ TEST_P(ToArrowTestSlice, SliceTest) auto expected_arrow_table = arrow_table->Slice(start, end - start); auto struct_meta = cudf::column_metadata{"f"}; struct_meta.children_meta = {{"integral"}, {"string"}}; - auto got_arrow_table = - cudf::to_arrow(sliced_cudf_table, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta}); - ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true); + std::vector const metadata = { + {"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta}; + ASSERT_TRUE(is_equal(sliced_cudf_table, metadata, expected_arrow_table)); } INSTANTIATE_TEST_CASE_P(ToArrowTest, @@ -595,13 +596,58 @@ using NumericTypesNotBool = cudf::test::Concat; TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool); +auto col_to_arrow_type(cudf::column_view const& col) +{ + switch (col.type().id()) { + case cudf::type_id::BOOL8: return arrow::boolean(); + case cudf::type_id::INT8: return arrow::int8(); + case cudf::type_id::INT16: return arrow::int16(); + case cudf::type_id::INT32: return arrow::int32(); + case cudf::type_id::INT64: return arrow::int64(); + case cudf::type_id::UINT8: return arrow::uint8(); + case cudf::type_id::UINT16: return arrow::uint16(); + case cudf::type_id::UINT32: return arrow::uint32(); + case cudf::type_id::UINT64: return arrow::uint64(); + case cudf::type_id::FLOAT32: return arrow::float32(); + case cudf::type_id::FLOAT64: return arrow::float64(); + case cudf::type_id::TIMESTAMP_DAYS: return arrow::date32(); + case cudf::type_id::STRING: return arrow::utf8(); + case cudf::type_id::LIST: + return arrow::list(col_to_arrow_type(col.child(cudf::lists_column_view::child_column_index))); + case cudf::type_id::DECIMAL128: return arrow::decimal(38, -col.type().scale()); + default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); + } +} + +std::optional> cudf_scalar_to_arrow( + cudf::scalar const& scalar, std::optional metadata = std::nullopt) +{ + auto const cudf_column = cudf::make_column_from_scalar(scalar, 1); + auto const c_arrow_array = cudf::to_arrow_host(*cudf_column); + auto const arrow_array = [&]() { + if (metadata.has_value()) { + auto const table = cudf::table_view({cudf_column->view()}); + std::vector const table_metadata = {metadata.value()}; + auto const arrow_schema = 
cudf::to_arrow_schema(table, table_metadata); + return arrow::ImportArray(&c_arrow_array->array, arrow_schema->children[0]).ValueOrDie(); + } else { + auto const arrow_type = col_to_arrow_type(cudf_column->view()); + return arrow::ImportArray(&c_arrow_array->array, arrow_type).ValueOrDie(); + } + }(); + auto const maybe_scalar = arrow_array->GetScalar(0); + if (!maybe_scalar.ok()) { return std::nullopt; } + return maybe_scalar.ValueOrDie(); +} + TYPED_TEST(ToArrowNumericScalarTest, Basic) { TypeParam const value{42}; auto const cudf_scalar = cudf::make_fixed_width_scalar(value); - cudf::column_metadata const metadata{""}; - auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar); + ASSERT_TRUE(maybe_scalar.has_value()); + auto const arrow_scalar = *maybe_scalar; auto const ref_arrow_scalar = arrow::MakeScalar(value); EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); @@ -621,8 +667,9 @@ TEST_F(ToArrowDecimalScalarTest, Basic) auto const cudf_scalar = cudf::make_fixed_point_scalar(value, numeric::scale_type{scale}); - cudf::column_metadata const metadata{""}; - auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar); + ASSERT_TRUE(maybe_scalar.has_value()); + auto const arrow_scalar = *maybe_scalar; auto const maybe_ref_arrow_scalar = arrow::MakeScalar(arrow::decimal128(precision, -scale), value); @@ -636,9 +683,10 @@ struct ToArrowStringScalarTest : public cudf::test::BaseFixture {}; TEST_F(ToArrowStringScalarTest, Basic) { std::string const value{"hello world"}; - auto const cudf_scalar = cudf::make_string_scalar(value); - cudf::column_metadata const metadata{""}; - auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + auto const cudf_scalar = cudf::make_string_scalar(value); + auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar); + ASSERT_TRUE(maybe_scalar.has_value()); + auto const arrow_scalar = *maybe_scalar; auto const ref_arrow_scalar = arrow::MakeScalar(value); EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); @@ -656,8 +704,9 @@ TEST_F(ToArrowListScalarTest, Basic) auto const cudf_scalar = cudf::make_list_scalar(col); - cudf::column_metadata const metadata{""}; - auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar); + ASSERT_TRUE(maybe_scalar.has_value()); + auto const arrow_scalar = *maybe_scalar; arrow::Int64Builder builder; auto const status = builder.AppendValues(host_values, host_validity); @@ -682,7 +731,10 @@ TEST_F(ToArrowStructScalarTest, Basic) cudf::column_metadata metadata{""}; metadata.children_meta.emplace_back(field_name); - auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar, metadata); + ASSERT_TRUE(maybe_scalar.has_value()); + auto const arrow_scalar = *maybe_scalar; auto const underlying_arrow_scalar = arrow::MakeScalar(value); auto const field = arrow::field(field_name, underlying_arrow_scalar->type, false); @@ -693,5 +745,3 @@ TEST_F(ToArrowStructScalarTest, Basic) } CUDF_TEST_PROGRAM_MAIN() - -#endif diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp deleted file mode 100644 index 9ba862585d0..00000000000 --- a/cpp/tests/streams/interop_test.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// These interop functions are deprecated. We keep the code in this
-// test and will migrate the tests to export via the arrow C data
-// interface with to_arrow_host which arrow can consume. For now, the
-// test is commented out.
-
-#if 0
-
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-struct ArrowTest : public cudf::test::BaseFixture {};
-
-TEST_F(ArrowTest, ToArrow)
-{
-  int32_t const value{42};
-  auto col = cudf::test::fixed_width_column_wrapper<int32_t>{{value}};
-  cudf::table_view tbl{{col}};
-
-  std::vector<cudf::column_metadata> metadata{{""}};
-  cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream());
-}
-
-TEST_F(ArrowTest, FromArrow)
-{
-  std::vector<int64_t> host_values = {1, 2, 3, 5, 6, 7, 8};
-  std::vector<bool> host_validity = {true, true, true, false, true, true, true};
-
-  arrow::Int64Builder builder;
-  auto status = builder.AppendValues(host_values, host_validity);
-  auto maybe_array = builder.Finish();
-  auto array = *maybe_array;
-
-  auto field = arrow::field("", arrow::int32());
-  auto schema = arrow::schema({field});
-  auto table = arrow::Table::Make(schema, {array});
-  cudf::from_arrow(*table, cudf::test::get_default_stream());
-}
-
-TEST_F(ArrowTest, ToArrowScalar)
-{
-  int32_t const value{42};
-  auto cudf_scalar =
-    cudf::make_fixed_width_scalar<int32_t>(value, cudf::test::get_default_stream());
-
-  cudf::column_metadata metadata{""};
-  cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream());
-}
-
-TEST_F(ArrowTest, FromArrowScalar)
-{
-  int32_t const value{42};
-  auto arrow_scalar = arrow::MakeScalar(value);
-  cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream());
-}
-
-#endif

From cb843dbdc2fc0c73c8af98909304c768bb65c16f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 16 Aug 2024 11:02:06 -1000
Subject: [PATCH 079/270] Fix DataFrame reductions with median returning scalar instead of Series (#16527)

xref https://github.com/rapidsai/cudf/issues/16507

This turned into a little bit of a refactor that also fixes the following:

* `cudf.DataFrame.from_pandas` not preserving the `pandas.DataFrame.columns.dtype`
* `cudf.DataFrame.<op>(axis=0)` not preserving the `.columns` properties in the resulting `.index`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16527
---
 python/cudf/cudf/core/column_accessor.py  |   3 +
 python/cudf/cudf/core/dataframe.py        | 120 ++++++++--------------
 python/cudf/cudf/core/indexed_frame.py    |  36 +------
 python/cudf/cudf/tests/test_dataframe.py  |   6 ++
 python/cudf/cudf/tests/test_reductions.py |  35 +++++++
 5 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 48bc84070b1..67c19f11e41 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++
b/python/cudf/cudf/core/column_accessor.py @@ -352,6 +352,9 @@ def insert( new_values = self.columns[:loc] + (value,) + self.columns[loc:] self._data = self._data.__class__(zip(new_keys, new_values)) self._clear_cache(old_ncols, old_ncols + 1) + if old_ncols == 0: + # The type(name) may no longer match the prior label_dtype + self.label_dtype = None def copy(self, deep=False) -> ColumnAccessor: """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6ee3d69441f..97684129203 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5497,14 +5497,9 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): ) if isinstance(dataframe, pd.DataFrame): - if not dataframe.columns.is_unique: - raise ValueError("Duplicate column names are not allowed") - data = { - col_name: column.as_column( - col_value.array, nan_as_null=nan_as_null - ) - for col_name, col_value in dataframe.items() + i: column.as_column(col_value.array, nan_as_null=nan_as_null) + for i, (_, col_value) in enumerate(dataframe.items()) } if isinstance(dataframe.index, pd.MultiIndex): index = cudf.MultiIndex.from_pandas( @@ -5515,14 +5510,8 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): dataframe.index, nan_as_null=nan_as_null ) df = cls._from_data(data, index) - df._data._level_names = tuple(dataframe.columns.names) - - if isinstance(dataframe.columns, pd.RangeIndex): - df._data.rangeindex = True - # Set columns only if it is a MultiIndex - elif isinstance(dataframe.columns, pd.MultiIndex): - df.columns = dataframe.columns - + # Checks duplicate columns and sets column metadata + df.columns = dataframe.columns return df elif hasattr(dataframe, "__dataframe__"): # TODO: Probably should be handled in the constructor as @@ -6382,8 +6371,11 @@ def _reduce( source = self if axis is None: + assert PANDAS_LT_300, "Replace if/else with just axis=2" + # TODO(pandas3.0): Remove if/else for just axis = 2 if op in {"sum", "product", "std", "var"}: - # Do not remove until pandas 2.0 support is added. + # pandas only raises FutureWarning for these ops + # though it applies for all reductions warnings.warn( f"In a future version, {type(self).__name__}" f".{op}(axis=None) will return a scalar {op} over " @@ -6402,9 +6394,7 @@ def _reduce( if numeric_only: numeric_cols = ( - name - for name in self._data.names - if is_numeric_dtype(self._data[name].dtype) + name for name, dtype in self._dtypes if is_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6414,62 +6404,41 @@ def _reduce( else source.index, dtype="float64", ) - if axis in {0, 2}: - if axis == 2 and op in ("kurtosis", "kurt", "skew"): - # TODO: concat + op can probably be done in the general case - # for axis == 2. 
- # https://github.com/rapidsai/cudf/issues/14930 - return getattr(concat_columns(source._data.columns), op)( - **kwargs - ) - try: - result = [ - getattr(source._data[col], op)(**kwargs) - for col in source._data.names - ] - except AttributeError: - numeric_ops = ( - "mean", - "min", - "max", - "sum", - "product", - "prod", - "std", - "var", - "kurtosis", - "kurt", - "skew", - ) - - if op in numeric_ops: + if ( + axis == 2 + and op in {"kurtosis", "skew"} + and self._num_rows < 4 + and self._num_columns > 1 + ): + # Total number of elements may satisfy the min number of values + # to compute skew/kurtosis + return getattr(concat_columns(source._columns), op)(**kwargs) + elif axis == 1: + return source._apply_cupy_method_axis_1(op, **kwargs) + else: + axis_0_results = [] + for col_label, col in source._data.items(): + try: + axis_0_results.append(getattr(col, op)(**kwargs)) + except AttributeError as err: if numeric_only: - try: - result = [ - getattr(source._data[col], op)(**kwargs) - for col in source._data.names - ] - except AttributeError: - raise NotImplementedError( - f"Not all column dtypes support op {op}" - ) - elif any( - not is_numeric_dtype(self._data[name].dtype) - for name in self._data.names - ): + raise NotImplementedError( + f"Column {col_label} with type {col.dtype} does not support {op}" + ) from err + elif not is_numeric_dtype(col.dtype): raise TypeError( "Non numeric columns passed with " "`numeric_only=False`, pass `numeric_only=True` " f"to perform DataFrame.{op}" - ) - else: - raise + ) from err + else: + raise if axis == 2: - return getattr(as_column(result, nan_as_null=False), op)( - **kwargs - ) + return getattr( + as_column(axis_0_results, nan_as_null=False), op + )(**kwargs) else: - source_dtypes = [c.dtype for c in source._data.columns] + source_dtypes = [dtype for _, dtype in source._dtypes] common_dtype = find_common_type(source_dtypes) if ( is_object_dtype(common_dtype) @@ -6483,17 +6452,14 @@ def _reduce( "Columns must all have the same dtype to " f"perform {op=} with {axis=}" ) + pd_index = source._data.to_pandas_index() if source._data.multiindex: - idx = MultiIndex.from_tuples( - source._data.names, names=source._data.level_names - ) + idx = MultiIndex.from_pandas(pd_index) else: - idx = cudf.Index(source._data.names) - return Series._from_column(as_column(result), index=idx) - elif axis == 1: - return source._apply_cupy_method_axis_1(op, **kwargs) - else: - raise ValueError(f"Invalid value of {axis=} received for {op}") + idx = cudf.Index.from_pandas(pd_index) + return Series._from_column( + as_column(axis_0_results), index=idx + ) @_performance_tracking def _scan( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2263dfd5c98..e46e24dd0d8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1386,11 +1386,6 @@ def sum( a 10 b 34 dtype: int64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum` - - Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "sum", @@ -1447,11 +1442,6 @@ def product( a 24 b 5040 dtype: int64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product` - - Parameters currently not supported are level`, `numeric_only`. 
""" return self._reduce( @@ -1508,7 +1498,9 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - def median(self, axis=None, skipna=True, numeric_only=None, **kwargs): + def median( + self, axis=no_default, skipna=True, numeric_only=None, **kwargs + ): """ Return the median of the values for the requested axis. @@ -1542,11 +1534,6 @@ def median(self, axis=None, skipna=True, numeric_only=None, **kwargs): dtype: int64 >>> ser.median() 17.0 - - .. pandas-compat:: - :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median` - - Parameters currently not supported are `level` and `numeric_only`. """ return self._reduce( "median", @@ -1598,12 +1585,6 @@ def std( a 1.290994 b 1.290994 dtype: float64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std` - - Parameters currently not supported are `level` and - `numeric_only` """ return self._reduce( @@ -1657,12 +1638,6 @@ def var( a 1.666667 b 1.666667 dtype: float64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var` - - Parameters currently not supported are `level` and - `numeric_only` """ return self._reduce( "var", @@ -1713,11 +1688,6 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): a -1.2 b -1.2 dtype: float64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.kurtosis` - - Parameters currently not supported are `level` and `numeric_only` """ if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 89eb5a12c71..9122a1074ac 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11114,3 +11114,9 @@ def test_bool_raises(): lfunc_args_and_kwargs=[[cudf.DataFrame()]], rfunc_args_and_kwargs=[[pd.DataFrame()]], ) + + +def test_from_pandas_preserve_column_dtype(): + df = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8")) + result = cudf.DataFrame.from_pandas(df) + pd.testing.assert_index_equal(result.columns, df.columns, exact=True) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 8be6463c699..a70a2ea15dd 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -358,6 +358,30 @@ def test_reductions_axis_none_warning(op): assert_eq(expected, actual, check_dtype=False) +@pytest.mark.parametrize( + "op", + [ + "sum", + "product", + "std", + "var", + "kurt", + "kurtosis", + "skew", + "min", + "max", + "mean", + "median", + ], +) +def test_dataframe_reduction_no_args(op): + df = cudf.DataFrame({"a": range(10), "b": range(10)}) + pdf = df.to_pandas() + result = getattr(df, op)() + expected = getattr(pdf, op)() + assert_eq(result, expected) + + def test_reduction_column_multiindex(): idx = cudf.MultiIndex.from_tuples( [("a", 1), ("a", 2)], names=["foo", "bar"] @@ -374,3 +398,14 @@ def test_dtype_deprecated(op): with pytest.warns(FutureWarning): result = getattr(ser, op)(dtype=np.dtype(np.int8)) assert isinstance(result, np.int8) + + +@pytest.mark.parametrize( + "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] +) +def test_dataframe_axis_0_preserve_column_type_in_index(columns): + pd_df = pd.DataFrame([[1, 2]], columns=columns) + cudf_df = cudf.DataFrame.from_pandas(pd_df) + result = cudf_df.sum(axis=0) + expected = pd_df.sum(axis=0) + assert_eq(result, expected, check_index_type=True) From 
fd44adc9e02dec4cdde9626f46ba231bda4a7ea6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 16 Aug 2024 13:02:49 -1000 Subject: [PATCH 080/270] Make CategoricalColumn.__init__ strict (#16456) This PR transfers some of the validation logic in `build_column` directly into `CategoricalColumn` just in case `CategoricalColumn` is called independently of `build_column`. Additionally adds stricter validation of `data`, `dtype` and `children` so the column doesn't represent an invalid state xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16456 --- python/cudf/cudf/_lib/column.pyx | 6 +-- python/cudf/cudf/core/column/categorical.py | 56 +++++++++++++-------- python/cudf/cudf/core/column/column.py | 9 +--- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 2e400f775d3..e27c595edda 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -86,7 +86,7 @@ cdef class Column: object mask=None, int offset=0, object null_count=None, - object children=() + tuple children=() ): if size < 0: raise ValueError("size must be >=0") @@ -297,11 +297,11 @@ cdef class Column: dtypes = [ base_child.dtype for base_child in self.base_children ] - self._children = [ + self._children = tuple( child._with_type_metadata(dtype) for child, dtype in zip( children, dtypes ) - ] + ) return self._children def set_base_children(self, value): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 66aed38bffd..1fdaf9f8c07 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -465,6 +465,18 @@ def reorder_categories( ) +def validate_categorical_children(children) -> None: + if not ( + len(children) == 1 + and isinstance(children[0], cudf.core.column.numerical.NumericalColumn) + and children[0].dtype.kind in "iu" + ): + # TODO: Enforce unsigned integer? + raise ValueError( + "Must specify exactly one child NumericalColumn of integers for representing the codes." + ) + + class CategoricalColumn(column.ColumnBase): """ Implements operations for Columns of Categorical type @@ -481,8 +493,7 @@ class CategoricalColumn(column.ColumnBase): respectively """ - dtype: cudf.core.dtypes.CategoricalDtype - _codes: NumericalColumn | None + dtype: CategoricalDtype _children: tuple[NumericalColumn] _VALID_REDUCTIONS = { "max", @@ -499,25 +510,29 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, + data: None, + size: int | None, dtype: CategoricalDtype, mask: Buffer | None = None, - size: int | None = None, offset: int = 0, null_count: int | None = None, - children: tuple["column.ColumnBase", ...] 
= (), + children: tuple[NumericalColumn] = (), # type: ignore[assignment] ): + if data is not None: + raise ValueError(f"{data=} must be None") + validate_categorical_children(children) if size is None: - for child in children: - assert child.offset == 0 - assert child.base_mask is None - size = children[0].size + child = children[0] + assert child.offset == 0 + assert child.base_mask is None + size = child.size size = size - offset - if isinstance(dtype, pd.api.types.CategoricalDtype): - dtype = CategoricalDtype.from_pandas(dtype) if not isinstance(dtype, CategoricalDtype): - raise ValueError("dtype must be instance of CategoricalDtype") + raise ValueError( + f"{dtype=} must be cudf.CategoricalDtype instance." + ) super().__init__( - data=None, + data=data, size=size, dtype=dtype, mask=mask, @@ -525,7 +540,7 @@ def __init__( null_count=null_count, children=children, ) - self._codes = None + self._codes = self.children[0].set_mask(self.mask) @property def base_size(self) -> int: @@ -558,13 +573,14 @@ def _process_values_for_isin( rhs = cudf.core.column.as_column(values, dtype=self.dtype) return lhs, rhs - def set_base_mask(self, value: Buffer | None): + def set_base_mask(self, value: Buffer | None) -> None: super().set_base_mask(value) - self._codes = None + self._codes = self.children[0].set_mask(self.mask) - def set_base_children(self, value: tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[NumericalColumn]) -> None: # type: ignore[override] super().set_base_children(value) - self._codes = None + validate_categorical_children(value) + self._codes = value[0].set_mask(self.mask) @property def children(self) -> tuple[NumericalColumn]: @@ -586,9 +602,7 @@ def categories(self) -> ColumnBase: @property def codes(self) -> NumericalColumn: - if self._codes is None: - self._codes = self.children[0].set_mask(self.mask) - return cast(cudf.core.column.NumericalColumn, self._codes) + return self._codes @property def ordered(self) -> bool: @@ -1131,7 +1145,7 @@ def _mimic_inplace( ) -> Self | None: out = super()._mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): - self._codes = other_col._codes + self._codes = other_col.codes return out def view(self, dtype: Dtype) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 090c02da990..19d6bf84d3f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1578,19 +1578,14 @@ def build_column( return col if isinstance(dtype, CategoricalDtype): - if not len(children) == 1: - raise ValueError( - "Must specify exactly one child column for CategoricalColumn" - ) - if not isinstance(children[0], ColumnBase): - raise TypeError("children must be a tuple of Columns") return cudf.core.column.CategoricalColumn( + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, offset=offset, null_count=null_count, - children=children, + children=children, # type: ignore[arg-type] ) elif dtype.type is np.datetime64: return cudf.core.column.DatetimeColumn( From b63ba70f2cf3724eeb118f9d2ec03a370c135f23 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 16 Aug 2024 18:27:07 -0700 Subject: [PATCH 081/270] Add build job for pylibcudf (#16587) This was missed in #16299 and is necessary to get builds published. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16587 --- .github/workflows/build.yaml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2fc39c06fad..9943b02a521 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -67,7 +67,27 @@ jobs: node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" sha: ${{ inputs.sha }} + wheel-build-pylibcudf: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_pylibcudf.sh + wheel-publish-pylibcudf: + needs: wheel-build-pylibcudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: pylibcudf wheel-build-cudf: + needs: wheel-publish-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: @@ -109,7 +129,7 @@ jobs: date: ${{ inputs.date }} package-name: dask_cudf wheel-build-cudf-polars: - needs: wheel-publish-cudf + needs: wheel-publish-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: From dd2c12dd8a8682b562bb3b420e0982f79a99438d Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 16 Aug 2024 23:18:42 -0400 Subject: [PATCH 082/270] Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" (#16586) Reverts rapidsai/cudf#16286 Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16586 --- python/cudf/cudf/pandas/_wrappers/numpy.py | 3 --- python/cudf/cudf/pandas/fast_slow_proxy.py | 20 +--------------- python/cudf/cudf/pandas/proxy_base.py | 23 ------------------- .../cudf_pandas_tests/test_cudf_pandas.py | 8 ------- 4 files changed, 1 insertion(+), 53 deletions(-) delete mode 100644 python/cudf/cudf/pandas/proxy_base.py diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index eabea9713f1..3b012169676 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -14,7 +14,6 @@ make_final_proxy_type, make_intermediate_proxy_type, ) -from ..proxy_base import ProxyNDarrayBase from .common import ( array_interface, array_method, @@ -112,14 +111,12 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): numpy.ndarray, fast_to_slow=cupy.ndarray.get, slow_to_fast=cupy.asarray, - bases=(ProxyNDarrayBase,), additional_attributes={ "__array__": array_method, # So that pa.array(wrapped-numpy-array) works "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, "__array_interface__": array_interface, - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), # ndarrays are unhashable "__hash__": None, # iter(cupy-array) produces an iterable of zero-dim device diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py 
b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 61aa6310082..bb678fd1efe 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -19,7 +19,6 @@
 from ..options import _env_get_bool
 from ..testing import assert_eq
 from .annotation import nvtx
-from .proxy_base import ProxyNDarrayBase


 def call_operator(fn, args, kwargs):
@@ -565,11 +564,7 @@ def _fsproxy_wrap(cls, value, func):
         _FinalProxy subclasses can override this classmethod
         if they need particular behaviour when wrapped up.
         """
-        base_class = _get_proxy_base_class(cls)
-        if base_class is object:
-            proxy = base_class.__new__(cls)
-        else:
-            proxy = base_class.__new__(cls, value)
+        proxy = object.__new__(cls)
         proxy._fsproxy_wrapped = value
         return proxy
@@ -1198,19 +1193,6 @@ def is_proxy_object(obj: Any) -> bool:
     return False


-def _get_proxy_base_class(cls):
-    """Returns the proxy base class if one exists"""
-    for proxy_class in PROXY_BASE_CLASSES:
-        if proxy_class in cls.__mro__:
-            return proxy_class
-    return object
-
-
-PROXY_BASE_CLASSES: set[type] = {
-    ProxyNDarrayBase,
-}
-
-
 NUMPY_TYPES: set[str] = set(np.sctypeDict.values())
diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py
deleted file mode 100644
index 61d9cde127c..00000000000
--- a/python/cudf/cudf/pandas/proxy_base.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import cupy as cp
-import numpy as np
-
-
-class ProxyNDarrayBase(np.ndarray):
-    def __new__(cls, arr):
-        if isinstance(arr, cp.ndarray):
-            obj = np.asarray(arr.get()).view(cls)
-            return obj
-        elif isinstance(arr, np.ndarray):
-            obj = np.asarray(arr).view(cls)
-            return obj
-        else:
-            raise TypeError(
-                "Unsupported array type. Must be numpy.ndarray or cupy.ndarray"
-            )
-
-    def __array_finalize__(self, obj):
-        self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", None)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index e5483fff913..6292022d8e4 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1632,11 +1632,3 @@ def test_change_index_name(index):

     assert s.index.name == name
     assert df.index.name == name
-
-
-def test_numpy_ndarray_isinstancecheck(series):
-    s1, s2 = series
-    arr1 = s1.values
-    arr2 = s2.values
-    assert isinstance(arr1, np.ndarray)
-    assert isinstance(arr2, np.ndarray)

From 592342c152af743390a923f125a380fe3b8f41c1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 19 Aug 2024 09:28:35 -0400
Subject: [PATCH 083/270] Remove invalid column_view usage in
 string-scalar-to-column function (#16530)

Fixes the `make_column_from_scalar` function for `string_scalar`, whose internal implementation built a temporary `column_view` with non-zero size but no data or children in order to call `cudf::strings::detail::fill`. This relied too much on fragile internal logic, which has caused several headaches, including during the recent work adding prefetch logic to libcudf.
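As a quick illustration (not part of this patch), here is a minimal sketch of one user-visible path that ends up building a strings column from a single scalar; the assumption that scalar column assignment reaches `make_column_from_scalar` is based on the function's role, not something stated in the diff:

```python
# Broadcasting one Python string across many rows materializes a full
# strings column on the GPU from a single scalar value.
import cudf

df = cudf.DataFrame({"x": range(4)})
df["y"] = "abc"  # one scalar filled into a 4-row strings column
print(df["y"].to_pandas().tolist())  # ['abc', 'abc', 'abc', 'abc']
```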
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16530 --- cpp/src/column/column_factories.cu | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu index bad20d6817c..ad9c5e4d3a0 100644 --- a/cpp/src/column/column_factories.cu +++ b/cpp/src/column/column_factories.cu @@ -20,11 +20,12 @@ #include #include #include -#include +#include #include #include +#include namespace cudf { @@ -57,15 +58,26 @@ std::unique_ptr column_from_scalar_dispatch::operator() const&>(value); + if (!value.is_valid(stream)) { + return make_strings_column( + size, + make_column_from_scalar(numeric_scalar(0), size + 1, stream, mr), + rmm::device_buffer{}, + size, + cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr)); + } - // fill the column with the scalar - auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr); + auto& ss = static_cast const&>(value); + auto const d_str = ss.value(stream); // no actual data is copied - return output; + // fill the column with the scalar + rmm::device_uvector indices(size, stream); + auto const row_value = + d_str.empty() ? cudf::strings::detail::string_index_pair{"", 0} + : cudf::strings::detail::string_index_pair{d_str.data(), d_str.size_bytes()}; + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), row_value); + return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr); } template <> From 1b18cbc1e0b0e5dd7109228ce34c0fde5a2ddcb8 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 19 Aug 2024 08:03:54 -0700 Subject: [PATCH 084/270] Add `ToCudfBackend` expression to dask-cudf (#16573) Adds a `ToCudfBackend` expression for "pandas" to "cudf" conversion, preventing `to_backend("cudf")` operations from blocking useful optimizations like predicate pushdown. This is the dask-cudf component of https://github.com/dask/dask-expr/pull/1115 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/cudf/pull/16573 --- python/dask_cudf/dask_cudf/backends.py | 20 +++++++----- python/dask_cudf/dask_cudf/expr/_expr.py | 31 ++++++++++++++++++- python/dask_cudf/dask_cudf/tests/test_core.py | 16 +++++++++- python/dask_cudf/dask_cudf/tests/utils.py | 4 +++ 4 files changed, 62 insertions(+), 9 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index a65ae819b44..16b2c8959e2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -537,6 +537,12 @@ def to_cudf_dispatch_from_pandas(data, nan_as_null=None, **kwargs): return cudf.from_pandas(data, nan_as_null=nan_as_null) +@to_cudf_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) +def to_cudf_dispatch_from_cudf(data, **kwargs): + _unsupported_kwargs("cudf", "cudf", kwargs) + return data + + # Define "cudf" backend engine to be registered with Dask class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """Backend-entrypoint class for Dask-DataFrame @@ -643,20 +649,20 @@ class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): Examples -------- >>> import dask - >>> import dask_expr + >>> import dask_expr as dx >>> with dask.config.set({"dataframe.backend": "cudf"}): ... ddf = dx.from_dict({"a": range(10)}) >>> type(ddf._meta) """ - @classmethod - def to_backend_dispatch(cls): - return CudfBackendEntrypoint.to_backend_dispatch() + @staticmethod + def to_backend(data, **kwargs): + import dask_expr as dx - @classmethod - def to_backend(cls, *args, **kwargs): - return CudfBackendEntrypoint.to_backend(*args, **kwargs) + from dask_cudf.expr._expr import ToCudfBackend + + return dx.new_collection(ToCudfBackend(data, kwargs)) @staticmethod def from_dict( diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 8fccaccb695..8a2c50d3fe7 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -4,12 +4,41 @@ import dask_expr._shuffle as _shuffle_module from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise -from dask_expr._expr import Expr, VarColumns +from dask_expr._expr import Elemwise, Expr, VarColumns from dask_expr._reductions import Reduction, Var from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype +import cudf + +## +## Custom expressions +## + + +class ToCudfBackend(Elemwise): + # TODO: Inherit from ToBackend when rapids-dask-dependency + # is pinned to dask>=2024.8.1 + _parameters = ["frame", "options"] + _projection_passthrough = True + _filter_passthrough = True + _preserves_partitioning_information = True + + @staticmethod + def operation(df, options): + from dask_cudf.backends import to_cudf_dispatch + + return to_cudf_dispatch(df, **options) + + def _simplify_down(self): + if isinstance( + self.frame._meta, (cudf.DataFrame, cudf.Series, cudf.Index) + ): + # We already have cudf data + return self.frame + + ## ## Custom expression patching ## diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 174923c2c7e..905d8c08135 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -15,7 +15,11 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr +from dask_cudf.tests.utils import ( + require_dask_expr, 
+    skip_dask_expr,
+    xfail_dask_expr,
+)


 def test_from_dict_backend_dispatch():
@@ -993,3 +997,13 @@ def test_series_isin_error():
         ser.isin([1, 5, "a"])
     with pytest.raises(TypeError):
         ddf.isin([1, 5, "a"]).compute()
+
+
+@require_dask_expr()
+def test_to_backend_simplify():
+    # Check that column projection is not blocked by to_backend
+    with dask.config.set({"dataframe.backend": "pandas"}):
+        df = dd.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}, npartitions=2)
+        df2 = df.to_backend("cudf")[["y"]].simplify()
+        df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify()
+        assert df2._name == df3._name
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index c7dedbb6b4a..cc0c6899804 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -48,3 +48,7 @@ def xfail_dask_expr(reason=_default_reason, lt_version=None):
     else:
         xfail = QUERY_PLANNING_ON
     return pytest.mark.xfail(xfail, reason=reason)
+
+
+def require_dask_expr(reason="requires dask-expr"):
+    return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason)

From 049177839e79dd28c776b5edfb2fd3f6c1b884a2 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Mon, 19 Aug 2024 17:05:16 +0200
Subject: [PATCH 085/270] MAINT: Adapt to numpy hiding flagsobject away
 (#16593)

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16593
---
 python/cudf/cudf/pandas/_wrappers/numpy.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 3b012169676..90ac5198270 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -7,7 +7,7 @@
 import cupy
 import cupy._core.flags
 import numpy
-import numpy.core.multiarray
+from packaging import version

 from ..fast_slow_proxy import (
     _FastSlowAttribute,
@@ -141,10 +141,15 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     },
 )

+if version.parse(numpy.__version__) >= version.parse("2.0"):
+    # NumPy 2 introduced `_core` and gives warnings for access to `core`.
+    from numpy._core.multiarray import flagsobj as _numpy_flagsobj
+else:
+    from numpy.core.multiarray import flagsobj as _numpy_flagsobj

 # Mapping flags between slow and fast types
 _ndarray_flags = make_intermediate_proxy_type(
     "_ndarray_flags",
     cupy._core.flags.Flags,
-    numpy.core.multiarray.flagsobj,
+    _numpy_flagsobj,
 )

From c516fc48694b6bdbeeb5b31ebdc760034efdb285 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 19 Aug 2024 06:55:03 -1000
Subject: [PATCH 086/270] Make ListColumn.__init__ strict (#16465)

This PR makes `ListColumn.__init__` strict, putting restrictions on data, dtype, size, and children so these columns cannot be constructed in an invalid state. It also aligns the signature with the base class.
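For illustration, a minimal sketch of the construction contract this enforces; `Series._column` and `.children` are internal accessors and the exact offsets dtype is an implementation detail, so treat this only as a sketch:

```python
# After this change a valid ListColumn always carries data=None, a
# cudf.ListDtype, and exactly two children: (integer offsets, values).
import cudf

ser = cudf.Series([[1, 2], [3]])
col = ser._column               # internal attribute, shown for illustration
offsets, values = col.children  # the two required children
print(offsets.dtype.kind, values.dtype)  # signed-integer offsets, int64 values
```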
xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16465 --- python/cudf/cudf/core/column/column.py | 5 +- python/cudf/cudf/core/column/lists.py | 64 +++++++++++++++++--------- python/cudf/cudf/core/column/string.py | 1 + python/cudf/cudf/tests/test_list.py | 2 + 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 19d6bf84d3f..0857727d23f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1625,12 +1625,13 @@ def build_column( ) elif isinstance(dtype, ListDtype): return cudf.core.column.ListColumn( - size=size, + data=None, + size=size, # type: ignore[arg-type] dtype=dtype, mask=mask, offset=offset, null_count=null_count, - children=children, + children=children, # type: ignore[arg-type] ) elif isinstance(dtype, IntervalDtype): return cudf.core.column.IntervalColumn( diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1b7cd95b3d0..302f04a0e71 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING, Sequence, cast import numpy as np import pandas as pd @@ -29,30 +29,46 @@ from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType +from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA if TYPE_CHECKING: from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike + from cudf.core.buffer import Buffer class ListColumn(ColumnBase): - dtype: ListDtype _VALID_BINARY_OPERATIONS = {"__add__", "__radd__"} def __init__( self, - size, - dtype, - mask=None, - offset=0, - null_count=None, - children=(), + data: None, + size: int, + dtype: ListDtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple[NumericalColumn, ColumnBase] = (), # type: ignore[assignment] ): + if data is not None: + raise ValueError("data must be None") + if not isinstance(dtype, ListDtype): + raise ValueError("dtype must be a cudf.ListDtype") + if not ( + len(children) == 2 + and isinstance(children[0], NumericalColumn) + # TODO: Enforce int32_t (size_type) used in libcudf? 
+ and children[0].dtype.kind == "i" + and isinstance(children[1], ColumnBase) + ): + raise ValueError( + "children must a tuple of 2 columns of (signed integer offsets, list values)" + ) super().__init__( - None, - size, - dtype, + data=data, + size=size, + dtype=dtype, mask=mask, offset=offset, null_count=null_count, @@ -131,7 +147,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: raise TypeError("can only concatenate list to list") @property - def elements(self): + def elements(self) -> ColumnBase: """ Column containing the elements of each list (may itself be a ListColumn) @@ -139,11 +155,11 @@ def elements(self): return self.children[1] @property - def offsets(self): + def offsets(self) -> NumericalColumn: """ Integer offsets to elements specifying each row of the ListColumn """ - return self.children[0] + return cast(NumericalColumn, self.children[0]) def to_arrow(self): offsets = self.offsets.to_arrow() @@ -172,10 +188,9 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_children(self, value: tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[NumericalColumn, ColumnBase]): # type: ignore[override] super().set_base_children(value) - _, values = value - self._dtype = cudf.ListDtype(element_type=values.dtype) + self._dtype = cudf.ListDtype(element_type=value[1].dtype) @property def __cuda_array_interface__(self): @@ -196,12 +211,13 @@ def _with_type_metadata( dtype.element_type ) return ListColumn( + data=None, dtype=dtype, mask=self.base_mask, size=self.size, offset=self.offset, null_count=self.null_count, - children=(self.base_children[0], elements), + children=(self.base_children[0], elements), # type: ignore[arg-type] ) return self @@ -226,24 +242,25 @@ def from_sequences( """ data_col = column.column_empty(0) mask_col = [] - offset_col = [0] + offset_vals = [0] offset = 0 # Build Data, Mask & Offsets for data in arbitrary: if cudf._lib.scalar._is_null_host_scalar(data): mask_col.append(False) - offset_col.append(offset) + offset_vals.append(offset) else: mask_col.append(True) data_col = data_col.append(as_column(data)) offset += len(data) - offset_col.append(offset) + offset_vals.append(offset) - offset_col = column.as_column(offset_col, dtype=size_type_dtype) + offset_col = column.as_column(offset_vals, dtype=size_type_dtype) # Build ListColumn res = cls( + data=None, size=len(arbitrary), dtype=cudf.ListDtype(data_col.dtype), mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)), @@ -283,12 +300,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: for c in cc: o = c.children[0] lc = cudf.core.column.ListColumn( # type: ignore + data=None, size=c.size, dtype=cudf.ListDtype(lc.dtype), mask=c.mask, offset=c.offset, null_count=c.null_count, - children=(o, lc), + children=(o, lc), # type: ignore[arg-type] ) return lc diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a710a9f46c2..6f7508822d4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -549,6 +549,7 @@ def _split_by_character(self): offset_col = col.children[0] return cudf.core.column.ListColumn( + data=None, size=len(col), dtype=cudf.ListDtype(col.dtype), mask=col.mask, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index c4c883ca9f9..7d87fc73621 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -928,6 +928,7 @@ def 
test_empty_nested_list_uninitialized_offsets_memory_usage():
     col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64")))
     nested_col = col.children[1]
     empty_inner = type(nested_col)(
+        data=None,
         size=nested_col.size,
         dtype=nested_col.dtype,
         mask=nested_col.mask,
@@ -939,6 +940,7 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage():
         ),
     )
     col_empty_offset = type(col)(
+        data=None,
         size=col.size,
         dtype=col.dtype,
         mask=col.mask,

From 074abcc0fa9eb9d2944b145f29fa02eb9edddc55 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 19 Aug 2024 13:37:21 -0700
Subject: [PATCH 087/270] Add `public` qualifier for some member functions in
 Java class `Schema` (#16583)

This adds the public qualifier for some member functions of the `Schema` class in Java code, allowing them to be accessed outside of the `ai.rapids.cudf` package, such as from spark-rapids-jni or the Spark plugin. Javadocs are also added for the functions that are newly public, as well as for some existing public functions.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/16583
---
 java/src/main/java/ai/rapids/cudf/Schema.java | 56 +++++++++++++----
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index 43603386649..76b2799aad6 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -120,7 +120,7 @@ private void flattenIfNeeded() {

   private int flattenedLength(int startingLength) {
     if (childSchemas != null) {
-      for (Schema child: childSchemas) {
+      for (Schema child : childSchemas) {
         startingLength++;
         startingLength = child.flattenedLength(startingLength);
       }
@@ -150,11 +150,19 @@ public static Builder builder() {
     return new Builder(DType.STRUCT);
   }

+  /**
+   * Get names of the columns flattened from all levels in schema by depth-first traversal.
+   * @return An array containing names of all columns in schema.
+   */
   public String[] getFlattenedColumnNames() {
     flattenIfNeeded();
     return flattenedNames;
   }

+  /**
+   * Get names of the top level child columns in schema.
+   * @return An array containing names of top level child columns.
+   */
   public String[] getColumnNames() {
     if (childNames == null) {
       return null;
@@ -162,6 +170,10 @@ public String[] getColumnNames() {
     return childNames.toArray(new String[childNames.size()]);
   }

+  /**
+   * Check if the schema is nested (i.e., top level type is LIST or STRUCT).
+   * @return true if the schema is nested, false otherwise.
+   */
   public boolean isNested() {
     return childSchemas != null && childSchemas.size() > 0;
   }
@@ -173,7 +185,7 @@ public boolean isNested() {
    */
   public boolean hasNestedChildren() {
     if (childSchemas != null) {
-      for (Schema child: childSchemas) {
+      for (Schema child : childSchemas) {
         if (child.isNested()) {
           return true;
         }
@@ -182,7 +194,11 @@ public boolean isNested() {
     return false;
   }

-  int[] getFlattenedTypeIds() {
+  /**
+   * Get type ids of the columns flattened from all levels in schema by depth-first traversal.
+   * @return An array containing type ids of all columns in schema.
+ */ + public int[] getFlattenedTypeIds() { flattenIfNeeded(); if (flattenedTypes == null) { return null; @@ -194,7 +210,11 @@ int[] getFlattenedTypeIds() { return ret; } - int[] getFlattenedTypeScales() { + /** + * Get scales of the columns' types flattened from all levels in schema by depth-first traversal. + * @return An array containing type scales of all columns in schema. + */ + public int[] getFlattenedTypeScales() { flattenIfNeeded(); if (flattenedTypes == null) { return null; @@ -206,11 +226,19 @@ int[] getFlattenedTypeScales() { return ret; } - DType[] getFlattenedTypes() { + /** + * Get the types of the columns in schema flattened from all levels by depth-first traversal. + * @return An array containing types of all columns in schema. + */ + public DType[] getFlattenedTypes() { flattenIfNeeded(); return flattenedTypes; } + /** + * Get types of the top level child columns in schema. + * @return An array containing types of top level child columns. + */ public DType[] getChildTypes() { if (childSchemas == null) { return null; @@ -222,6 +250,10 @@ public DType[] getChildTypes() { return ret; } + /** + * Get number of top level child columns in schema. + * @return Number of child columns. + */ public int getNumChildren() { if (childSchemas == null) { return 0; @@ -229,7 +261,11 @@ public int getNumChildren() { return childSchemas.size(); } - int[] getFlattenedNumChildren() { + /** + * Get numbers of child columns for each level in schema. + * @return Numbers of child columns for all levels flattened by depth-first traversal. + */ + public int[] getFlattenedNumChildren() { flattenIfNeeded(); return flattenedCounts; } @@ -253,7 +289,7 @@ public boolean isStructOrHasStructDescendant() { public HostColumnVector.DataType asHostDataType() { if (topLevelType == DType.LIST) { - assert(childSchemas != null && childSchemas.size() == 1); + assert (childSchemas != null && childSchemas.size() == 1); HostColumnVector.DataType element = childSchemas.get(0).asHostDataType(); return new HostColumnVector.ListType(true, element); } else if (topLevelType == DType.STRUCT) { @@ -261,7 +297,7 @@ public HostColumnVector.DataType asHostDataType() { return new HostColumnVector.StructType(true); } else { List childTypes = - childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList()); + childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList()); return new HostColumnVector.StructType(true, childTypes); } } else { @@ -269,7 +305,7 @@ public HostColumnVector.DataType asHostDataType() { } } - public static class Builder { + public static class Builder { private final DType topLevelType; private final List names; private final List types; @@ -326,7 +362,7 @@ public Schema build() { List children = null; if (types != null) { children = new ArrayList<>(types.size()); - for (Builder b: types) { + for (Builder b : types) { children.add(b.build()); } } From 79a5a97b2662bab6862ed895a6d802edd17d2502 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 19 Aug 2024 13:59:10 -0700 Subject: [PATCH 088/270] Remove NativeFile support from cudf Python (#16589) This PR removes all support for passing NativeFile objects through cudf's I/O routines. 
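For reference, a small sketch (mine, not from the PR) of reader inputs that remain supported after this removal: local paths, open fsspec files, and in-memory buffers. Pyarrow `NativeFile` objects are no longer special-cased.

```python
# In-memory buffers and plain paths keep working; the NativeFile handling
# removed in this PR is simply gone.
import io
import cudf

buf = io.BytesIO()
cudf.DataFrame({"a": [1, 2, 3]}).to_parquet(buf)
buf.seek(0)
df = cudf.read_parquet(buf)  # BytesIO input, still supported
print(len(df))  # 3
```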
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16589 --- python/cudf/cudf/_lib/csv.pyx | 9 +- python/cudf/cudf/_lib/orc.pyx | 10 - python/cudf/cudf/_lib/parquet.pyx | 43 +--- python/cudf/cudf/io/csv.py | 11 +- python/cudf/cudf/io/orc.py | 33 +-- python/cudf/cudf/io/parquet.py | 102 ++------ python/cudf/cudf/tests/test_csv.py | 13 - python/cudf/cudf/tests/test_gcs.py | 6 +- python/cudf/cudf/tests/test_parquet.py | 33 +-- python/cudf/cudf/tests/test_s3.py | 168 +++---------- python/cudf/cudf/utils/ioutils.py | 234 ++---------------- python/cudf/cudf/utils/utils.py | 26 -- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 48 ---- python/pylibcudf/pylibcudf/io/datasource.pxd | 7 - python/pylibcudf/pylibcudf/io/datasource.pyx | 24 -- 15 files changed, 86 insertions(+), 681 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index e9aa97ecbc9..a90fe0f9ac6 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -7,7 +7,6 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.io.datasource cimport Datasource, NativeFileDatasource from cudf._lib.types cimport dtype_to_pylibcudf_type @@ -35,8 +34,6 @@ from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.io.utils cimport make_sink_info from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table -from pyarrow.lib import NativeFile - import pylibcudf as plc from cudf.api.types import is_hashable @@ -127,9 +124,7 @@ def read_csv( cudf.read_csv """ - if not isinstance(datasource, (BytesIO, StringIO, bytes, - Datasource, - NativeFile)): + if not isinstance(datasource, (BytesIO, StringIO, bytes)): if not os.path.isfile(datasource): raise FileNotFoundError( errno.ENOENT, os.strerror(errno.ENOENT), datasource @@ -139,8 +134,6 @@ def read_csv( datasource = datasource.read().encode() elif isinstance(datasource, str) and not os.path.isfile(datasource): datasource = datasource.encode() - elif isinstance(datasource, NativeFile): - datasource = NativeFileDatasource(datasource) validate_args(delimiter, sep, delim_whitespace, decimal, thousands, nrows, skipfooter, byte_range, skiprows) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index d506dcd4346..adeba6fffb1 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -22,7 +22,6 @@ except ImportError: import json cimport pylibcudf.libcudf.io.types as cudf_io_types -from pylibcudf.io.datasource cimport NativeFileDatasource from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.orc cimport ( chunked_orc_writer_options, @@ -71,8 +70,6 @@ from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table -from pyarrow.lib import NativeFile - from cudf._lib.utils import _index_level_name, generate_pandas_metadata @@ -204,10 +201,6 @@ cpdef read_parsed_orc_statistics(filepath_or_buffer): cudf.io.orc.read_orc_statistics """ - # Handle NativeFile input - if isinstance(filepath_or_buffer, NativeFile): - filepath_or_buffer = NativeFileDatasource(filepath_or_buffer) - cdef parsed_orc_statistics parsed = ( libcudf_read_parsed_orc_statistics(make_source_info([filepath_or_buffer])) ) @@ -490,9 +483,6 @@ 
cdef orc_reader_options make_orc_reader_options( bool use_index ) except*: - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - filepaths_or_buffers[i] = NativeFileDatasource(datasource) cdef vector[vector[size_type]] strps = stripes cdef orc_reader_options opts cdef source_info src = make_source_info(filepaths_or_buffers) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 4bfb79ff651..c874a51e220 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -34,7 +34,6 @@ from libcpp.vector cimport vector cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport pylibcudf.libcudf.io.types as cudf_io_types from pylibcudf.expressions cimport Expression -from pylibcudf.io.datasource cimport NativeFileDatasource from pylibcudf.io.parquet cimport ChunkedParquetReader from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_writer_options, @@ -62,8 +61,6 @@ from cudf._lib.io.utils cimport ( ) from cudf._lib.utils cimport table_view_from_table -from pyarrow.lib import NativeFile - import pylibcudf as plc from pylibcudf cimport Table @@ -133,7 +130,6 @@ cdef object _process_metadata(object df, list per_file_user_data, object row_groups, object filepaths_or_buffers, - list pa_buffers, bool allow_range_index, bool use_pandas_metadata, size_type nrows=-1, @@ -199,9 +195,7 @@ cdef object _process_metadata(object df, pa.parquet.read_metadata( # Pyarrow cannot read directly from bytes io.BytesIO(s) if isinstance(s, bytes) else s - ) for s in ( - pa_buffers or filepaths_or_buffers - ) + ) for s in filepaths_or_buffers ] filtered_idx = [] @@ -274,27 +268,13 @@ def read_parquet_chunked( size_type nrows=-1, int64_t skip_rows=0 ): - # Convert NativeFile buffers to NativeFileDatasource, - # but save original buffers in case we need to use - # pyarrow for metadata processing - # (See: https://github.com/rapidsai/cudf/issues/9599) - - pa_buffers = [] - - new_bufs = [] - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - new_bufs.append(NativeFileDatasource(datasource)) - else: - new_bufs.append(datasource) - # Note: If this function ever takes accepts filters # allow_range_index needs to be False when a filter is passed # (see read_parquet) allow_range_index = columns is not None and len(columns) != 0 reader = ChunkedParquetReader( - plc.io.SourceInfo(new_bufs), + plc.io.SourceInfo(filepaths_or_buffers), columns, row_groups, use_pandas_metadata, @@ -333,7 +313,7 @@ def read_parquet_chunked( ) df = _process_metadata(df, column_names, child_names, per_file_user_data, row_groups, - filepaths_or_buffers, pa_buffers, + filepaths_or_buffers, allow_range_index, use_pandas_metadata, nrows=nrows, skip_rows=skip_rows) return df @@ -356,16 +336,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, cudf.io.parquet.to_parquet """ - # Convert NativeFile buffers to NativeFileDatasource, - # but save original buffers in case we need to use - # pyarrow for metadata processing - # (See: https://github.com/rapidsai/cudf/issues/9599) - pa_buffers = [] - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - pa_buffers.append(datasource) - filepaths_or_buffers[i] = NativeFileDatasource(datasource) - allow_range_index = True if columns is not None and len(columns) == 0 or filters: allow_range_index = False @@ -389,7 +359,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, df 
= _process_metadata(df, tbl_w_meta.column_names(include_children=False), tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, - row_groups, filepaths_or_buffers, pa_buffers, + row_groups, filepaths_or_buffers, allow_range_index, use_pandas_metadata, nrows=nrows, skip_rows=skip_rows) return df @@ -403,11 +373,6 @@ cpdef read_parquet_metadata(filepaths_or_buffers): cudf.io.parquet.read_parquet cudf.io.parquet.to_parquet """ - # Convert NativeFile buffers to NativeFileDatasource - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - filepaths_or_buffers[i] = NativeFileDatasource(datasource) - cdef cudf_io_types.source_info source = make_source_info(filepaths_or_buffers) args = move(source) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 0f2820a01e9..e61fc5063dc 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -5,7 +5,6 @@ from io import BytesIO, StringIO import numpy as np -from pyarrow.lib import NativeFile import cudf from cudf import _lib as libcudf @@ -50,7 +49,6 @@ def read_csv( comment=None, delim_whitespace=False, byte_range=None, - use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): @@ -63,12 +61,6 @@ def read_csv( FutureWarning, ) - if use_python_file_object and bytes_per_thread is not None: - raise ValueError( - "bytes_per_thread is only supported when " - "`use_python_file_object=False`" - ) - if bytes_per_thread is None: bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT @@ -84,8 +76,7 @@ def read_csv( filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=filepath_or_buffer, compression=compression, - iotypes=(BytesIO, StringIO, NativeFile), - use_python_file_object=use_python_file_object, + iotypes=(BytesIO, StringIO), storage_options=storage_options, bytes_per_thread=bytes_per_thread, ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 289292b5182..4f04caafc5d 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,7 +10,6 @@ from cudf._lib import orc as liborc from cudf.api.types import is_list_like from cudf.utils import ioutils -from cudf.utils.utils import maybe_filter_deprecation def _make_empty_df(filepath_or_buffer, columns): @@ -281,7 +280,6 @@ def read_orc( num_rows=None, use_index=True, timestamp_type=None, - use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): @@ -321,9 +319,6 @@ def read_orc( ) filepaths_or_buffers = [] - have_nativefile = any( - isinstance(source, pa.NativeFile) for source in filepath_or_buffer - ) for source in filepath_or_buffer: if ioutils.is_directory( path_or_data=source, storage_options=storage_options @@ -339,7 +334,6 @@ def read_orc( tmp_source, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=source, compression=None, - use_python_file_object=use_python_file_object, storage_options=storage_options, bytes_per_thread=bytes_per_thread, ) @@ -364,24 +358,17 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - # Don't want to warn if use_python_file_object causes us to get - # a NativeFile (there is a separate deprecation warning for that) - with maybe_filter_deprecation( - not have_nativefile, - message="Support for reading pyarrow's NativeFile is deprecated", - category=FutureWarning, - ): - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, - ) + return DataFrame._from_data( + 
*liborc.read_orc( + filepaths_or_buffers, + columns, + stripes, + skiprows, + num_rows, + use_index, + timestamp_type, ) + ) else: from pyarrow import orc diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 4a419a2fbb6..fac51a9e471 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd -import pyarrow as pa from pyarrow import dataset as ds import cudf @@ -24,7 +23,6 @@ from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import maybe_filter_deprecation BYTE_SIZES = { "kb": 1000, @@ -352,8 +350,6 @@ def read_parquet_metadata(filepath_or_buffer): path_or_data=source, compression=None, fs=fs, - use_python_file_object=None, - open_file_options=None, storage_options=None, bytes_per_thread=None, ) @@ -534,9 +530,7 @@ def read_parquet( filters=None, row_groups=None, use_pandas_metadata=True, - use_python_file_object=None, categorical_partitions=True, - open_file_options=None, bytes_per_thread=None, dataset_kwargs=None, nrows=None, @@ -549,16 +543,6 @@ def read_parquet( raise ValueError( f"Only supported engines are {{'cudf', 'pyarrow'}}, got {engine=}" ) - # Do not allow the user to set file-opening options - # when `use_python_file_object=False` is specified - if use_python_file_object is False: - if open_file_options: - raise ValueError( - "open_file_options is not currently supported when " - "use_python_file_object is set to False." - ) - open_file_options = {} - if bytes_per_thread is None: bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT @@ -612,23 +596,11 @@ def read_parquet( filepath_or_buffer = paths if paths else filepath_or_buffer filepaths_or_buffers = [] - if use_python_file_object: - open_file_options = _default_open_file_options( - open_file_options=open_file_options, - columns=columns, - row_groups=row_groups, - fs=fs, - ) - have_nativefile = any( - isinstance(source, pa.NativeFile) for source in filepath_or_buffer - ) for source in filepath_or_buffer: tmp_source, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=source, compression=None, fs=fs, - use_python_file_object=use_python_file_object, - open_file_options=open_file_options, storage_options=storage_options, bytes_per_thread=bytes_per_thread, ) @@ -669,28 +641,20 @@ def read_parquet( ) # Convert parquet data to a cudf.DataFrame - - # Don't want to warn if use_python_file_object causes us to get - # a NativeFile (there is a separate deprecation warning for that) - with maybe_filter_deprecation( - not have_nativefile, - message="Support for reading pyarrow's NativeFile is deprecated", - category=FutureWarning, - ): - df = _parquet_to_frame( - filepaths_or_buffers, - engine, - *args, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - partition_keys=partition_keys, - partition_categories=partition_categories, - dataset_kwargs=dataset_kwargs, - nrows=nrows, - skip_rows=skip_rows, - **kwargs, - ) + df = _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + dataset_kwargs=dataset_kwargs, + nrows=nrows, + skip_rows=skip_rows, + **kwargs, + ) # Apply filters row-wise (if any are defined), and return df = _apply_post_filters(df, filters) if 
projected_columns: @@ -1570,44 +1534,6 @@ def __exit__(self, *args): self.close() -def _default_open_file_options( - open_file_options, columns, row_groups, fs=None -): - """ - Set default fields in open_file_options. - - Copies and updates `open_file_options` to - include column and row-group information - under the "precache_options" key. By default, - we set "method" to "parquet", but precaching - will be disabled if the user chooses `method=None` - - Parameters - ---------- - open_file_options : dict or None - columns : list - row_groups : list - fs : fsspec.AbstractFileSystem, Optional - """ - if fs and ioutils._is_local_filesystem(fs): - # Quick return for local fs - return open_file_options or {} - # Assume remote storage if `fs` was not specified - open_file_options = (open_file_options or {}).copy() - precache_options = open_file_options.pop("precache_options", {}).copy() - if precache_options.get("method", "parquet") == "parquet": - precache_options.update( - { - "method": "parquet", - "engine": precache_options.get("engine", "pyarrow"), - "columns": columns, - "row_groups": row_groups, - } - ) - open_file_options["precache_options"] = precache_options - return open_file_options - - def _hive_dirname(name, val): # Simple utility to produce hive directory name if pd.isna(val): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 6a21cb1b9d7..40ba415e681 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -13,7 +13,6 @@ import numpy as np import pandas as pd import pytest -from pyarrow import fs as pa_fs import cudf from cudf import read_csv @@ -1080,18 +1079,6 @@ def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src): assert_eq(expect, got) -def test_csv_reader_arrow_nativefile(path_or_buf): - # Check that we can read a file opened with the - # Arrow FileSystem interface - expect = cudf.read_csv(path_or_buf("filepath")) - fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath")) - with pytest.warns(FutureWarning): - with fs.open_input_file(path) as fil: - got = cudf.read_csv(fil) - - assert_eq(expect, got) - - def test_small_zip(tmpdir): df = pd.DataFrame( { diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index 28fdfb5c2f1..82ecd356bbf 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -42,12 +42,8 @@ def mock_size(*args): monkeypatch.setattr(gcsfs.core.GCSFileSystem, "size", mock_size) # Test read from explicit path. 
- # Since we are monkey-patching, we cannot use - # use_python_file_object=True, because the pyarrow - # `open_input_file` command will fail (since it doesn't - # use the monkey-patched `open` definition) with pytest.warns(FutureWarning): - got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) + got = cudf.read_csv(f"gcs://{fpath}") assert_eq(pdf, got) # AbstractBufferedFile -> PythonFile conversion diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 879a2c50db7..db4f1c9c8bd 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -19,7 +19,7 @@ import pytest from fsspec.core import get_fs_token_paths from packaging import version -from pyarrow import fs as pa_fs, parquet as pq +from pyarrow import parquet as pq import cudf from cudf._lib.parquet import read_parquet_chunked @@ -705,40 +705,17 @@ def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src): assert_eq(expect, got) -def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): - # Check that we can read a file opened with the - # Arrow FileSystem interface - expect = cudf.read_parquet(parquet_path_or_buf("filepath")) - fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath")) - with fs.open_input_file(path) as fil: - with pytest.warns(FutureWarning): - got = cudf.read_parquet(fil) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("use_python_file_object", [True, False]) -def test_parquet_reader_use_python_file_object( - parquet_path_or_buf, use_python_file_object -): - # Check that the non-default `use_python_file_object=True` - # option works as expected +def test_parquet_reader_file_types(parquet_path_or_buf): expect = cudf.read_parquet(parquet_path_or_buf("filepath")) fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) # Pass open fsspec file - with pytest.warns(FutureWarning): - with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet( - fil, use_python_file_object=use_python_file_object - ) + with fs.open(paths[0], mode="rb") as fil: + got1 = cudf.read_parquet(fil) assert_eq(expect, got1) # Pass path only - with pytest.warns(FutureWarning): - got2 = cudf.read_parquet( - paths[0], use_python_file_object=use_python_file_object - ) + got2 = cudf.read_parquet(paths[0]) assert_eq(expect, got2) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 3ae318d3bf5..6579fd23634 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd -import pyarrow.fs as pa_fs import pytest from fsspec.core import get_fs_token_paths @@ -138,48 +137,17 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): buffer = pdf.to_csv(index=False) # Use fsspec file object - with pytest.warns(FutureWarning): - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - use_python_file_object=False, - ) - assert_eq(pdf, got) - - # Use Arrow PythonFile object - with pytest.warns(FutureWarning): - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - use_python_file_object=True, - ) - assert_eq(pdf, got) - - -def test_read_csv_arrow_nativefile(s3_base, s3so, pdf): - # Write to buffer - fname = "test_csv_reader_arrow_nativefile.csv" - bucket = "csv" - buffer = 
pdf.to_csv(index=False) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, ) - with pytest.warns(FutureWarning): - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_csv(fil) - assert_eq(pdf, got) @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -@pytest.mark.parametrize("use_python_file_object", [True, False]) -def test_read_csv_byte_range( - s3_base, s3so, pdf, bytes_per_thread, use_python_file_object -): +def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread): # Write to buffer fname = "test_csv_reader_byte_range.csv" bucket = "csv" @@ -187,18 +155,14 @@ def test_read_csv_byte_range( # Use fsspec file object with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - with pytest.warns(FutureWarning): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - byte_range=(74, 73), - bytes_per_thread=bytes_per_thread - if not use_python_file_object - else None, - header=None, - names=["Integer", "Float", "Integer2", "String", "Boolean"], - use_python_file_object=use_python_file_object, - ) + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread, + header=None, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + ) assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) @@ -226,16 +190,12 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) @pytest.mark.parametrize("columns", [None, ["Float", "String"]]) -@pytest.mark.parametrize("precache", [None, "parquet"]) -@pytest.mark.parametrize("use_python_file_object", [True, False]) def test_read_parquet( s3_base, s3so, pdf, bytes_per_thread, columns, - precache, - use_python_file_object, ): fname = "test_parquet_reader.parquet" bucket = "parquet" @@ -245,19 +205,12 @@ def test_read_parquet( # Check direct path handling buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - with pytest.warns(FutureWarning): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - open_file_options=( - {"precache_options": {"method": precache}} - if use_python_file_object - else None - ), - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + got1 = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -268,13 +221,11 @@ def test_read_parquet( f"s3://{bucket}/{fname}", storage_options=s3so )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - with pytest.warns(FutureWarning): - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + ) assert_eq(expect, got2) @@ -350,28 +301,7 @@ def test_read_parquet_multi_file(s3_base, s3so, pdf): assert_eq(expect, got) -@pytest.mark.parametrize("columns", [None, ["Float", "String"]]) -def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): - # Write to buffer - fname = "test_parquet_reader_arrow_nativefile.parquet" - bucket = "parquet" - buffer = 
BytesIO() - pdf.to_parquet(path=buffer) - buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - with pytest.warns(FutureWarning): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], - ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_parquet(fil, columns=columns) - - expect = pdf[columns] if columns else pdf - assert_eq(expect, got) - - -@pytest.mark.parametrize("precache", [None, "parquet"]) -def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): +def test_read_parquet_filters(s3_base, s3so, pdf_ext): fname = "test_parquet_reader_filters.parquet" bucket = "parquet" buffer = BytesIO() @@ -379,13 +309,11 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - with pytest.warns(FutureWarning): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - open_file_options={"precache_options": {"method": precache}}, - ) + got = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + filters=filters, + ) # All row-groups should be filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @@ -445,33 +373,8 @@ def test_read_json(s3_base, s3so): assert_eq(expect, got) -@pytest.mark.parametrize("use_python_file_object", [False, True]) -@pytest.mark.parametrize("columns", [None, ["string1"]]) -def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): - source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") - fname = "test_orc_reader.orc" - bucket = "orc" - expect = pd.read_orc(source_file) - - with open(source_file, "rb") as f: - buffer = f.read() - - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - with pytest.warns(FutureWarning): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - use_python_file_object=use_python_file_object, - ) - - if columns: - expect = expect[columns] - assert_eq(expect, got) - - @pytest.mark.parametrize("columns", [None, ["string1"]]) -def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): +def test_read_orc(s3_base, s3so, datadir, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bucket = "orc" @@ -481,12 +384,11 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): buffer = f.read() with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], + got = cudf.read_orc( + f"s3://{bucket}/{fname}", + columns=columns, + storage_options=s3so, ) - with pytest.warns(FutureWarning): - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_orc(fil, columns=columns) if columns: expect = expect[columns] diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 448a815fe1b..4ac9b63985f 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -13,19 +13,10 @@ import numpy as np import pandas as pd from fsspec.core import get_fs_token_paths -from pyarrow import PythonFile as ArrowPythonFile -from pyarrow.lib import NativeFile -from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial -try: - import fsspec.parquet as fsspec_parquet - -except 
ImportError: - fsspec_parquet = None - _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 @@ -173,32 +164,12 @@ use_pandas_metadata : boolean, default True If True and dataset has custom PANDAS schema metadata, ensure that index columns are also loaded. -use_python_file_object : boolean, default True - If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. - - .. deprecated:: 24.08 - `use_python_file_object` is deprecated and will be removed in a future - version of cudf, as PyArrow NativeFiles will no longer be accepted as - input/output in cudf readers/writers in the future. -open_file_options : dict, optional - Dictionary of key-value pairs to pass to the function used to open remote - files. By default, this will be `fsspec.parquet.open_parquet_file`. To - deactivate optimized precaching, set the "method" to `None` under the - "precache_options" key. Note that the `open_file_func` key can also be - used to specify a custom file-open function. - - .. deprecated:: 24.08 - `open_file_options` is deprecated as it was intended for - pyarrow file inputs, which will no longer be accepted as - input/output cudf readers/writers in the future. bytes_per_thread : int, default None Determines the number of bytes to be allocated per thread to read the files in parallel. When there is a file of large size, we get slightly better throughput by decomposing it and transferring multiple "blocks" in parallel (using a python thread pool). Default allocation is {bytes_per_thread} bytes. - This parameter is functional only when `use_python_file_object=False`. skiprows : int, default None If not None, the number of rows to skip from the start of the file. @@ -485,14 +456,6 @@ This parameter is deprecated. use_index : bool, default True If True, use row index if available for faster seeking. -use_python_file_object : boolean, default True - If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. - - .. deprecated:: 24.08 - `use_python_file_object` is deprecated and will be removed in a future - version of cudf, as PyArrow NativeFiles will no longer be accepted as - input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -506,7 +469,6 @@ better throughput by decomposing it and transferring multiple "blocks" in parallel (using a python thread pool). Default allocation is {bytes_per_thread} bytes. - This parameter is functional only when `use_python_file_object=False`. Returns ------- @@ -1209,14 +1171,6 @@ size to zero to read all data after the offset location. Reads the row that starts before or at the end of the range, even if it ends after the end of the range. -use_python_file_object : boolean, default True - If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. - - .. deprecated:: 24.08 - `use_python_file_object` is deprecated and will be removed in a future - version of cudf, as PyArrow NativeFiles will no longer be accepted as - input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. 
For HTTP(S) URLs the key-value @@ -1230,7 +1184,6 @@ better throughput by decomposing it and transferring multiple "blocks" in parallel (using a python thread pool). Default allocation is {bytes_per_thread} bytes. - This parameter is functional only when `use_python_file_object=False`. Returns ------- GPU ``DataFrame`` object. @@ -1454,22 +1407,6 @@ Mode in which file is opened iotypes : (), default (BytesIO) Object type to exclude from file-like check -use_python_file_object : boolean, default False - If True, Arrow-backed PythonFile objects will be used in place - of fsspec AbstractBufferedFile objects. - - .. deprecated:: 24.08 - `use_python_file_object` is deprecated and will be removed in a future - version of cudf, as PyArrow NativeFiles will no longer be accepted as - input/output in cudf readers/writers. -open_file_options : dict, optional - Optional dictionary of keyword arguments to pass to - `_open_remote_files` (used for remote storage only). - - .. deprecated:: 24.08 - `open_file_options` is deprecated as it was intended for - pyarrow file inputs, which will no longer be accepted as - input/output cudf readers/writers in the future. allow_raw_text_input : boolean, default False If True, this indicates the input `path_or_data` could be a raw text input and will not check for its existence in the filesystem. If False, @@ -1490,7 +1427,6 @@ better throughput by decomposing it and transferring multiple "blocks" in parallel (using a Python thread pool). Default allocation is {bytes_per_thread} bytes. - This parameter is functional only when `use_python_file_object=False`. Returns ------- @@ -1635,119 +1571,13 @@ def _get_filesystem_and_paths(path_or_data, storage_options): return fs, return_paths -def _set_context(obj, stack): - # Helper function to place open file on context stack - if stack is None: - return obj - return stack.enter_context(obj) - - -def _open_remote_files( - paths, - fs, - context_stack=None, - open_file_func=None, - precache_options=None, - **kwargs, -): - """Return a list of open file-like objects given - a list of remote file paths. - - Parameters - ---------- - paths : list(str) - List of file-path strings. - fs : fsspec.AbstractFileSystem - Fsspec file-system object. - context_stack : contextlib.ExitStack, Optional - Context manager to use for open files. - open_file_func : Callable, Optional - Call-back function to use for opening. If this argument - is specified, all other arguments will be ignored. - precache_options : dict, optional - Dictionary of key-word arguments to pass to use for - precaching. Unless the input contains ``{"method": None}``, - ``fsspec.parquet.open_parquet_file`` will be used for remote - storage. - **kwargs : - Key-word arguments to be passed to format-specific - open functions. - """ - - # Just use call-back function if one was specified - if open_file_func is not None: - return [ - _set_context(open_file_func(path, **kwargs), context_stack) - for path in paths - ] - - # Check if the "precache" option is supported. 
- # In the future, fsspec should do this check for us - precache_options = (precache_options or {}).copy() - precache = precache_options.pop("method", None) - if precache not in ("parquet", None): - raise ValueError(f"{precache} not a supported `precache` option.") - - # Check that "parts" caching (used for all format-aware file handling) - # is supported by the installed fsspec/s3fs version - if precache == "parquet" and not fsspec_parquet: - warnings.warn( - f"This version of fsspec ({fsspec.__version__}) does " - f"not support parquet-optimized precaching. Please upgrade " - f"to the latest fsspec version for better performance." - ) - precache = None - - if precache == "parquet": - # Use fsspec.parquet module. - # TODO: Use `cat_ranges` to collect "known" - # parts for all files at once. - row_groups = precache_options.pop("row_groups", None) or ( - [None] * len(paths) - ) - return [ - ArrowPythonFile( - _set_context( - fsspec_parquet.open_parquet_file( - path, - fs=fs, - row_groups=rgs, - **precache_options, - **kwargs, - ), - context_stack, - ) - ) - for path, rgs in zip(paths, row_groups) - ] - - # Avoid top-level pyarrow.fs import. - # Importing pyarrow.fs initializes a S3 SDK with a finalizer - # that runs atexit. In some circumstances it appears this - # runs a call into a logging system that is already shutdown. - # To avoid this, we only import this subsystem if it is - # really needed. - # See https://github.com/aws/aws-sdk-cpp/issues/2681 - from pyarrow.fs import FSSpecHandler, PyFileSystem - - # Default open - Use pyarrow filesystem API - pa_fs = PyFileSystem(FSSpecHandler(fs)) - return [ - _set_context(pa_fs.open_input_file(fpath), context_stack) - for fpath in paths - ] - - @doc_get_reader_filepath_or_buffer() def get_reader_filepath_or_buffer( path_or_data, compression, mode="rb", fs=None, - iotypes=(BytesIO, NativeFile), - # no_default aliases to False - use_python_file_object=no_default, - open_file_options=None, + iotypes=(BytesIO,), allow_raw_text_input=False, storage_options=None, bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, @@ -1758,30 +1588,6 @@ def get_reader_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) - if use_python_file_object is no_default: - use_python_file_object = False - elif use_python_file_object is not None: - warnings.warn( - "The 'use_python_file_object' keyword is deprecated and " - "will be removed in a future version.", - FutureWarning, - ) - else: - # Preserve the readers (e.g. 
read_csv) default of True - # if no use_python_file_object option is specified by the user - # for now (note: this is different from the default for this - # function of False) - # TODO: when non-pyarrow file reading perf is good enough - # we can default this to False - use_python_file_object = True - - if open_file_options is not None: - warnings.warn( - "The 'open_file_options' keyword is deprecated and " - "will be removed in a future version.", - FutureWarning, - ) - if isinstance(path_or_data, str): # Get a filesystem object if one isn't already available paths = [path_or_data] @@ -1866,38 +1672,28 @@ def get_reader_filepath_or_buffer( raise FileNotFoundError( f"{path_or_data} could not be resolved to any files" ) - if use_python_file_object: - path_or_data = _open_remote_files( - paths, - fs, - **(open_file_options or {}), - ) - else: - path_or_data = [ - BytesIO( - _fsspec_data_transfer( - fpath, - fs=fs, - mode=mode, - bytes_per_thread=bytes_per_thread, - ) + path_or_data = [ + BytesIO( + _fsspec_data_transfer( + fpath, + fs=fs, + mode=mode, + bytes_per_thread=bytes_per_thread, ) - for fpath in paths - ] + ) + for fpath in paths + ] if len(path_or_data) == 1: path_or_data = path_or_data[0] elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data): if isinstance(path_or_data, TextIOWrapper): path_or_data = path_or_data.buffer - if use_python_file_object: - path_or_data = ArrowPythonFile(path_or_data) - else: - path_or_data = BytesIO( - _fsspec_data_transfer( - path_or_data, mode=mode, bytes_per_thread=bytes_per_thread - ) + path_or_data = BytesIO( + _fsspec_data_transfer( + path_or_data, mode=mode, bytes_per_thread=bytes_per_thread ) + ) return path_or_data, compression diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c9b343e0f9f..7347ec7866a 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -6,7 +6,6 @@ import os import traceback import warnings -from contextlib import contextmanager import numpy as np import pandas as pd @@ -404,28 +403,3 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) return result_col - - -@contextmanager -def maybe_filter_deprecation( - condition: bool, message: str, category: type[Warning] -): - """Conditionally filter a warning category. 
- - Parameters - ---------- - condition - If true, filter the warning - message - Message to match, passed to :func:`warnings.filterwarnings` - category - Category of warning, passed to :func:`warnings.filterwarnings` - """ - with warnings.catch_warnings(): - if condition: - warnings.filterwarnings( - "ignore", - message, - category=category, - ) - yield diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 99f19917424..a14ffbc37dc 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -119,22 +119,6 @@ def test_read_csv(s3_base, s3so): assert df.a.sum().compute() == 4 -def test_read_csv_warns(s3_base, s3so): - with s3_context( - s3_base=s3_base, - bucket="daskcsv_warns", - files={"a.csv": b"a,b\n1,2\n3,4\n"}, - ): - with pytest.warns(FutureWarning): - df = dask_cudf.read_csv( - "s3://daskcsv_warns/*.csv", - blocksize="50 B", - storage_options=s3so, - use_python_file_object=True, - ) - assert df.a.sum().compute() == 4 - - def test_read_parquet_open_file_options_raises(): with pytest.raises(ValueError): dask_cudf.read_parquet( @@ -198,22 +182,6 @@ def test_read_parquet(s3_base, s3so, pdf): assert_eq(pdf, got) -def test_read_parquet_use_python_file_object(s3_base, s3so, pdf): - fname = "test_parquet_use_python_file_object.parquet" - bucket = "parquet" - buffer = BytesIO() - pdf.to_parquet(path=buffer) - buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - with pytest.warns(FutureWarning): - got = dask_cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - read={"use_python_file_object": True}, - ).head() - assert_eq(pdf, got) - - def test_read_orc(s3_base, s3so, pdf): fname = "test_orc_reader_dask.orc" bucket = "orc" @@ -226,19 +194,3 @@ def test_read_orc(s3_base, s3so, pdf): storage_options=s3so, ) assert_eq(pdf, got) - - -def test_read_orc_use_python_file_object(s3_base, s3so, pdf): - fname = "test_orc_use_python_file_object.orc" - bucket = "orc" - buffer = BytesIO() - pdf.to_orc(path=buffer) - buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - with pytest.warns(FutureWarning): - got = dask_cudf.read_orc( - f"s3://{bucket}/{fname}", - storage_options=s3so, - use_python_file_object=True, - ).head() - assert_eq(pdf, got) diff --git a/python/pylibcudf/pylibcudf/io/datasource.pxd b/python/pylibcudf/pylibcudf/io/datasource.pxd index 05c03dceee2..c08f36693c7 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pxd +++ b/python/pylibcudf/pylibcudf/io/datasource.pxd @@ -1,14 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport shared_ptr -from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source from pylibcudf.libcudf.io.datasource cimport datasource cdef class Datasource: cdef datasource* get_datasource(self) except * nogil - - -cdef class NativeFileDatasource(Datasource): - cdef shared_ptr[arrow_io_source] c_datasource - cdef datasource* get_datasource(self) nogil diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx index 6cc509b74cb..02418444caa 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -1,34 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport shared_ptr -from pyarrow.includes.libarrow cimport CRandomAccessFile -from pyarrow.lib cimport NativeFile -from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source from pylibcudf.libcudf.io.datasource cimport datasource -import warnings - cdef class Datasource: cdef datasource* get_datasource(self) except * nogil: with gil: raise NotImplementedError("get_datasource() should not " + "be directly invoked here") - -cdef class NativeFileDatasource(Datasource): - - def __cinit__(self, NativeFile native_file): - - cdef shared_ptr[CRandomAccessFile] ra_src - - warnings.warn( - "Support for reading pyarrow's NativeFile is deprecated " - "and will be removed in a future release of cudf.", - FutureWarning, - ) - - ra_src = native_file.get_random_access_file() - self.c_datasource.reset(new arrow_io_source(ra_src)) - - cdef datasource* get_datasource(self) nogil: - return (self.c_datasource.get()) From 6ccc2c2e4d7b4cda0bb4f844a28d69254049b795 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 19 Aug 2024 16:39:34 -0500 Subject: [PATCH 089/270] standardize and consolidate wheel installations in testing scripts (#16575) I noticed some common changes to wheel-testing scripts in the PRs splitting off `pylibcudf` (#16299) and `libcudf` (#15483). * consolidating multiple `pip install`'s into 1 - *(this is safer, as it removes the risk of `pip` replacing a previously-installed CI package with another one from a remote package repository)* * standardizing the approach used for "install some wheels built earlier in this same CI run" These can go onto `branch-24.10` right now, so proposing them in a separate PR so that `cudf` CI can benefit from them without having to wait on those large PRs. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16575 --- ci/cudf_pandas_scripts/pandas-tests/run.sh | 13 +++++++++---- ci/cudf_pandas_scripts/run_tests.sh | 13 +++++++++---- ci/test_wheel_cudf.sh | 11 ++++++----- ci/test_wheel_cudf_polars.sh | 15 ++++++++++----- ci/test_wheel_dask_cudf.sh | 14 +++++++------- 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 8deaeab78a3..97c3139080f 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -11,10 +11,15 @@ rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl) -python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests] + +# Download the cudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install \ + "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \ + "$(echo 
./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index bfb655db3ca..8215ce729b3 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -36,10 +36,15 @@ if [ "$no_cudf" = true ]; then echo "Skipping cudf install" else RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep - RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep - python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl) - python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests] + + # Download the cudf and pylibcudf built in the previous step + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + + # echo to expand wildcard before adding `[extra]` requires for pip + python -m pip install \ + "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \ + "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" fi python -m pytest -p cudf.pandas \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 5a2c3ccac8f..19131952098 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -3,15 +3,16 @@ set -eou pipefail -# Download the pylibcudf built in the previous step RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep + +# Download the cudf and pylibcudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -# Install both pylibcudf and cudf +# echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ - "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)[test]" \ - "$(echo ./dist/cudf*.whl)[test]" + "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 357d4170d47..6438d13c4b7 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -20,12 +20,17 @@ fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the cudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep -python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl +# Download the cudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -rapids-logger "Install cudf_polars" -python -m pip install $(echo ./dist/cudf_polars*.whl)[test] 
+rapids-logger "Installing cudf_polars and its dependencies" + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install \ + "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" rapids-logger "Run cudf_polars tests" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 4d045472604..ff893a08e27 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -6,15 +6,15 @@ set -eou pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the cudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install \ - "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)" \ - "$(echo ./local-cudf-dep/cudf*.whl)" +# Download the cudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist # echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/dask_cudf*.whl)[test] +python -m pip install \ + "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ From f2d13c9dbe957cd2a5cbf93a339149ab3edc0240 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 19 Aug 2024 17:23:23 -0500 Subject: [PATCH 090/270] make more use of YAML anchors in dependencies.yaml (#16597) Contributes to https://github.com/rapidsai/build-planning/issues/33 Follow-up to #16299 This proposes some simplifications to `dependencies.yaml`. It's not intended to change any behavior. * more use of YAML anchors for requirements that are intended to be identical to each other * eliminating the `pylibcudf_build_dep` dependency group that was introduced in #16299, in favor of just tracking the `pylibcudf` build dependency alongside `cudf`'s `rmm` build dependency in the existing `build_python_cudf` group - *(sorry I'd missed that in the review on #16299)* I found myself starting to make similar changes in the PR breaking up these packages into more (splitting out a `libcudf` in #15483) and thought they'd be better as a standalone PR. 
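For anyone less familiar with this YAML feature: an anchor (`&name`) is defined at the first occurrence of a value, and every later alias (`*name`) re-uses that value, so a pinned version like `rmm-cu12==24.10.*,>=0.0.0a0` only has to be written (and bumped) in one place. Here is a minimal sketch of the semantics using PyYAML; the group names in it are illustrative, not the real `dependencies.yaml` contents:

```python
import yaml  # PyYAML, used here only to illustrate anchor/alias resolution

# The first mention defines the anchor; every alias re-uses it verbatim.
doc = """
build_python_cudf:
  - &rmm_cu12 rmm-cu12==24.10.*,>=0.0.0a0
test_python_cudf:
  - *rmm_cu12
"""

parsed = yaml.safe_load(doc)

# Both groups resolve to the identical pinned string, so bumping the
# version at the anchor updates every alias at once.
assert parsed["build_python_cudf"] == parsed["test_python_cudf"]
print(parsed["test_python_cudf"])  # ['rmm-cu12==24.10.*,>=0.0.0a0']
```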
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16597 --- dependencies.yaml | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index ca615905a15..a774345fe95 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -96,7 +96,6 @@ files: - build_base - build_python_common - build_python_cudf - - pylibcudf_build_dep py_run_cudf: output: pyproject pyproject_dir: python/cudf @@ -383,12 +382,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==24.10.*,>=0.0.0a0 + - &rmm_cu12 rmm-cu12==24.10.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==24.10.*,>=0.0.0a0 + - &rmm_cu11 rmm-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*rmm_unsuffixed]} build_python_cudf: common: @@ -412,34 +411,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==24.10.*,>=0.0.0a0 - - pylibcudf-cu12==24.10.*,>=0.0.0a0 + - &pylibcudf_cu12 pylibcudf-cu12==24.10.*,>=0.0.0a0 + - *rmm_cu12 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==24.10.*,>=0.0.0a0 - - pylibcudf-cu11==24.10.*,>=0.0.0a0 - - {matrix: null, packages: [*rmm_unsuffixed]} - pylibcudf_build_dep: - common: - - output_types: conda - packages: - - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0 - specific: - - output_types: [pyproject] - matrices: - - matrix: - cuda: "12.*" - cuda_suffixed: "true" - packages: - - pylibcudf-cu12==24.10.*,>=0.0.0a0 + - &pylibcudf_cu11 pylibcudf-cu11==24.10.*,>=0.0.0a0 + - *rmm_cu11 - matrix: - cuda: "11.*" - cuda_suffixed: "true" packages: - - pylibcudf-cu11==24.10.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibcudf_unsuffixed]} + - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0 + - *rmm_unsuffixed libarrow_build: common: - output_types: conda @@ -677,12 +660,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==24.10.*,>=0.0.0a0 + - *rmm_cu12 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==24.10.*,>=0.0.0a0 + - *rmm_cu11 - {matrix: null, packages: [*rmm_unsuffixed]} run_cudf: common: @@ -728,7 +711,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==24.10.*,>=0.0.0a0 + - *rmm_cu12 - pynvjitlink-cu12>=0.0.0a0 - matrix: cuda: "12.*" @@ -740,7 +723,7 @@ dependencies: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==24.10.*,>=0.0.0a0 + - *rmm_cu11 - cubinlinker-cu11 - ptxcompiler-cu11 - matrix: @@ -874,12 +857,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcudf-cu12==24.10.*,>=0.0.0a0 + - *pylibcudf_cu12 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibcudf-cu11==24.10.*,>=0.0.0a0 + - *pylibcudf_cu11 - {matrix: null, packages: [*pylibcudf_unsuffixed]} depends_on_cudf: common: From 3f6dd14e26deccc761ed06790cf806edc266d5e4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 19 Aug 2024 12:29:15 -1000 Subject: [PATCH 091/270] Make StructColumn.__init__ strict (#16467) This PR makes `StructColumn.__init__` strict putting restrictions on data, dtype, size and children so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class. 
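The subtle part is that `IntervalDtype` subclasses `StructDtype`, so the new `_validate_dtype_instance` staticmethod in this diff compares types exactly instead of using `isinstance`; otherwise an interval dtype would slip through. A simplified, self-contained sketch of that pattern (stand-in classes, not the real cudf dtypes):

```python
class StructDtype:
    pass

class IntervalDtype(StructDtype):  # subclass relationship mirrors cudf's
    pass

def validate_struct_dtype(dtype):
    # isinstance(dtype, StructDtype) would also accept IntervalDtype,
    # so compare the type exactly to keep the two column types distinct.
    if type(dtype) is not StructDtype:
        raise ValueError(f"{type(dtype).__name__} must be a StructDtype exactly.")
    return dtype

validate_struct_dtype(StructDtype())  # accepted
try:
    validate_struct_dtype(IntervalDtype())
except ValueError as err:
    print(err)  # IntervalDtype must be a StructDtype exactly.
```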
xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16467 --- python/cudf/cudf/core/column/column.py | 13 +++-- python/cudf/cudf/core/column/interval.py | 71 ++++++++++++++++-------- python/cudf/cudf/core/column/struct.py | 50 ++++++++++++++--- python/cudf/cudf/core/index.py | 6 +- 4 files changed, 100 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0857727d23f..27278120abb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1635,22 +1635,23 @@ def build_column( ) elif isinstance(dtype, IntervalDtype): return cudf.core.column.IntervalColumn( + data=None, + size=size, # type: ignore[arg-type] dtype=dtype, mask=mask, - size=size, offset=offset, - children=children, null_count=null_count, + children=children, # type: ignore[arg-type] ) elif isinstance(dtype, StructDtype): return cudf.core.column.StructColumn( - data=data, - dtype=dtype, + data=None, size=size, # type: ignore[arg-type] - offset=offset, + dtype=dtype, mask=mask, + offset=offset, null_count=null_count, - children=children, + children=children, # type: ignore[arg-type] ) elif isinstance(dtype, cudf.Decimal64Dtype): return cudf.core.column.Decimal64Column( diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index d9fc96a9f3e..9147270c289 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -11,32 +11,46 @@ from cudf.core.dtypes import IntervalDtype if TYPE_CHECKING: + from typing_extensions import Self + from cudf._typing import ScalarLike + from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase class IntervalColumn(StructColumn): def __init__( self, - dtype, - mask=None, - size=None, - offset=0, - null_count=None, - children=(), + data: None, + size: int, + dtype: IntervalDtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple[ColumnBase, ColumnBase] = (), # type: ignore[assignment] ): + if len(children) != 2: + raise ValueError( + "children must be a tuple of two columns (left edges, right edges)." 
+ ) super().__init__( - data=None, + data=data, + size=size, dtype=dtype, mask=mask, - size=size, offset=offset, null_count=null_count, children=children, ) + @staticmethod + def _validate_dtype_instance(dtype: IntervalDtype) -> IntervalDtype: + if not isinstance(dtype, IntervalDtype): + raise ValueError("dtype must be a IntervalDtype.") + return dtype + @classmethod - def from_arrow(cls, data): + def from_arrow(cls, data: pa.Array) -> Self: new_col = super().from_arrow(data.storage) size = len(data) dtype = IntervalDtype.from_arrow(data.type) @@ -48,16 +62,17 @@ def from_arrow(cls, data): null_count = data.null_count children = new_col.children - return IntervalColumn( + return cls( + data=None, size=size, dtype=dtype, mask=mask, offset=offset, null_count=null_count, - children=children, + children=children, # type: ignore[arg-type] ) - def to_arrow(self): + def to_arrow(self) -> pa.Array: typ = self.dtype.to_arrow() struct_arrow = super().to_arrow() if len(struct_arrow) == 0: @@ -67,9 +82,14 @@ def to_arrow(self): return pa.ExtensionArray.from_storage(typ, struct_arrow) @classmethod - def from_struct_column(cls, struct_column: StructColumn, closed="right"): + def from_struct_column( + cls, + struct_column: StructColumn, + closed: Literal["left", "right", "both", "neither"] = "right", + ) -> Self: first_field_name = next(iter(struct_column.dtype.fields.keys())) - return IntervalColumn( + return cls( + data=None, size=struct_column.size, dtype=IntervalDtype( struct_column.dtype.fields[first_field_name], closed @@ -77,12 +97,13 @@ def from_struct_column(cls, struct_column: StructColumn, closed="right"): mask=struct_column.base_mask, offset=struct_column.offset, null_count=struct_column.null_count, - children=struct_column.base_children, + children=struct_column.base_children, # type: ignore[arg-type] ) - def copy(self, deep=True): + def copy(self, deep: bool = True) -> Self: struct_copy = super().copy(deep=deep) - return IntervalColumn( + return IntervalColumn( # type: ignore[return-value] + data=None, size=struct_copy.size, dtype=IntervalDtype( struct_copy.dtype.fields["left"], self.dtype.closed @@ -90,7 +111,7 @@ def copy(self, deep=True): mask=struct_copy.base_mask, offset=struct_copy.offset, null_count=struct_copy.null_count, - children=struct_copy.base_children, + children=struct_copy.base_children, # type: ignore[arg-type] ) @property @@ -138,25 +159,27 @@ def overlaps(other) -> ColumnBase: def set_closed( self, closed: Literal["left", "right", "both", "neither"] - ) -> IntervalColumn: - return IntervalColumn( + ) -> Self: + return IntervalColumn( # type: ignore[return-value] + data=None, size=self.size, dtype=IntervalDtype(self.dtype.fields["left"], closed), mask=self.base_mask, offset=self.offset, null_count=self.null_count, - children=self.base_children, + children=self.base_children, # type: ignore[arg-type] ) - def as_interval_column(self, dtype): + def as_interval_column(self, dtype: IntervalDtype) -> Self: # type: ignore[override] if isinstance(dtype, IntervalDtype): - return IntervalColumn( + return IntervalColumn( # type: ignore[return-value] + data=None, size=self.size, dtype=dtype, mask=self.mask, offset=self.offset, null_count=self.null_count, - children=tuple( + children=tuple( # type: ignore[arg-type] child.astype(dtype.subtype) for child in self.children ), ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index c2ce787eeae..2fda3b2c434 100644 --- a/python/cudf/cudf/core/column/struct.py +++ 
b/python/cudf/cudf/core/column/struct.py @@ -14,7 +14,10 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from typing_extensions import Self + from cudf._typing import Dtype + from cudf.core.buffer import Buffer class StructColumn(ColumnBase): @@ -23,10 +26,39 @@ class StructColumn(ColumnBase): Every column has n children, where n is the number of fields in the Struct Dtype. - """ - dtype: StructDtype + def __init__( + self, + data: None, + size: int, + dtype: StructDtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple[ColumnBase, ...] = (), + ): + if data is not None: + raise ValueError("data must be None.") + dtype = self._validate_dtype_instance(dtype) + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) + + @staticmethod + def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: + # IntervalDtype is a subclass of StructDtype, so compare types exactly + if type(dtype) is not StructDtype: + raise ValueError( + f"{type(dtype).__name__} must be a StructDtype exactly." + ) + return dtype @property def base_size(self): @@ -35,7 +67,7 @@ def base_size(self): else: return self.size + self.offset - def to_arrow(self): + def to_arrow(self) -> pa.Array: children = [ pa.nulls(len(child)) if len(child) == child.null_count @@ -50,7 +82,7 @@ def to_arrow(self): } ) - if self.nullable: + if self.mask is not None: buffers = (pa.py_buffer(self.mask.memoryview()),) else: buffers = (None,) @@ -73,7 +105,7 @@ def to_pandas( return pd.Index(self.to_arrow().tolist(), dtype="object") @cached_property - def memory_usage(self): + def memory_usage(self) -> int: n = 0 if self.nullable: n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) @@ -99,7 +131,7 @@ def __setitem__(self, key, value): value = cudf.Scalar(value, self.dtype) super().__setitem__(key, value) - def copy(self, deep=True): + def copy(self, deep: bool = True) -> Self: # Since struct columns are immutable, both deep and # shallow copies share the underlying device data and mask. result = super().copy(deep=False) @@ -107,15 +139,15 @@ def copy(self, deep=True): result = result._rename_fields(self.dtype.fields.keys()) return result - def _rename_fields(self, names): + def _rename_fields(self, names) -> Self: """ Return a StructColumn with the same field values as this StructColumn, but with the field names equal to `names`. 
""" - dtype = cudf.core.dtypes.StructDtype( + dtype = StructDtype( {name: col.dtype for name, col in zip(names, self.children)} ) - return StructColumn( + return StructColumn( # type: ignore[return-value] data=None, size=self.size, dtype=dtype, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ee2f0317f8d..6a5e718c2c5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3354,6 +3354,7 @@ def interval_range( return IntervalIndex(data, closed=closed, name=name) interval_col = IntervalColumn( + data=None, dtype=IntervalDtype(left_col.dtype, closed), size=len(left_col), children=(left_col, right_col), @@ -3425,6 +3426,7 @@ def __init__( elif isinstance(data.dtype, (pd.IntervalDtype, IntervalDtype)): data = np.array([], dtype=data.dtype.subtype) interval_col = IntervalColumn( + None, dtype=IntervalDtype(data.dtype, closed), size=len(data), children=(as_column(data), as_column(data)), @@ -3436,12 +3438,13 @@ def __init__( if copy: col = col.copy() interval_col = IntervalColumn( + data=None, dtype=IntervalDtype(col.dtype.subtype, closed), mask=col.mask, size=col.size, offset=col.offset, null_count=col.null_count, - children=col.children, + children=col.children, # type: ignore[arg-type] ) if dtype: @@ -3517,6 +3520,7 @@ def from_breaks( ) interval_col = IntervalColumn( + data=None, dtype=IntervalDtype(left_col.dtype, closed), size=len(left_col), children=(left_col, right_col), From a45af4a61ba582d6af839702148e9a6e2da69bc9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 19 Aug 2024 19:06:28 -0700 Subject: [PATCH 092/270] Remove arrow_io_source (#16607) The `arrow_io_source` in libcudf only existed to support Python's pyarrow NativeFile integration, which was deprecated and removed in #16589. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Thomas Li (https://github.com/lithomas1) - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16607 --- cpp/CMakeLists.txt | 1 - cpp/include/cudf/io/arrow_io_source.hpp | 93 ---------------- cpp/src/io/utilities/arrow_io_source.cpp | 87 --------------- cpp/tests/CMakeLists.txt | 4 - cpp/tests/io/arrow_io_source_test.cpp | 103 ------------------ cpp/tests/io/csv_test.cpp | 26 +---- cpp/tests/io/json/json_test.cpp | 26 ----- .../pylibcudf/libcudf/io/arrow_io_source.pxd | 14 --- 8 files changed, 1 insertion(+), 353 deletions(-) delete mode 100644 cpp/include/cudf/io/arrow_io_source.hpp delete mode 100644 cpp/src/io/utilities/arrow_io_source.cpp delete mode 100644 cpp/tests/io/arrow_io_source_test.cpp delete mode 100644 python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eeafc411874..ff00c484501 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -438,7 +438,6 @@ add_library( src/io/text/bgzip_data_chunk_source.cu src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu - src/io/utilities/arrow_io_source.cpp src/io/utilities/base64_utilities.cpp src/io/utilities/column_buffer.cpp src/io/utilities/column_buffer_strings.cu diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp deleted file mode 100644 index ed5c839cbb4..00000000000 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "datasource.hpp" - -#include - -#include -#include - -#include -#include -#include - -namespace CUDF_EXPORT cudf { -namespace io { -/** - * @addtogroup io_datasources - * @{ - * @file - */ - -/** - * @brief Implementation class for reading from an Apache Arrow file. The file - * could be a memory-mapped file or other implementation supported by Arrow. - */ -class arrow_io_source : public datasource { - public: - /** - * @brief Constructs an object from an Apache Arrow Filesystem URI - * - * @param arrow_uri Apache Arrow Filesystem URI - */ - explicit arrow_io_source(std::string const& arrow_uri); - - /** - * @brief Constructs an object from an `arrow` source object. - * - * @param file The `arrow` object from which the data is read - */ - explicit arrow_io_source(std::shared_ptr file) - : arrow_file(std::move(file)) - { - } - - /** - * @brief Returns a buffer with a subset of data from the `arrow` source. - * - * @param offset The offset in bytes from which to read - * @param size The number of bytes to read - * @return A buffer with the read data - */ - std::unique_ptr host_read(size_t offset, size_t size) override; - - /** - * @brief Reads a selected range from the `arrow` source into a preallocated buffer. - * - * @param[in] offset The offset in bytes from which to read - * @param[in] size The number of bytes to read - * @param[out] dst The preallocated buffer to read into - * @return The number of bytes read - */ - size_t host_read(size_t offset, size_t size, uint8_t* dst) override; - /** - * @brief Returns the size of the data in the `arrow` source. - * - * @return The size of the data in the `arrow` source - */ - [[nodiscard]] size_t size() const override; - - private: - std::shared_ptr filesystem; - std::shared_ptr arrow_file; -}; - -/** @} */ // end of group -} // namespace io -} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/utilities/arrow_io_source.cpp b/cpp/src/io/utilities/arrow_io_source.cpp deleted file mode 100644 index 157240b8b08..00000000000 --- a/cpp/src/io/utilities/arrow_io_source.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf::io { - -/** - * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. 
- */ -class arrow_io_buffer : public datasource::buffer { - std::shared_ptr arrow_buffer; - - public: - explicit arrow_io_buffer(std::shared_ptr arrow_buffer) - : arrow_buffer(std::move(arrow_buffer)) - { - } - [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } - [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } -}; - -arrow_io_source::arrow_io_source(std::string const& arrow_uri) -{ - std::string const uri_start_delimiter = "//"; - std::string const uri_end_delimiter = "?"; - - auto const result = arrow::fs::FileSystemFromUri(arrow_uri); - CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); - filesystem = result.ValueOrDie(); - - // Parse the path from the URI - auto const start = [&]() { - auto const delim_start = arrow_uri.find(uri_start_delimiter); - return delim_start == std::string::npos ? 0 : delim_start + uri_start_delimiter.size(); - }(); - auto const end = arrow_uri.find(uri_end_delimiter) - start; - auto const path = arrow_uri.substr(start, end); - - auto const in_stream = filesystem->OpenInputFile(path); - CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); - arrow_file = in_stream.ValueOrDie(); -} - -std::unique_ptr arrow_io_source::host_read(size_t offset, size_t size) -{ - auto const result = arrow_file->ReadAt(offset, size); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return std::make_unique(result.ValueOrDie()); -} - -size_t arrow_io_source::host_read(size_t offset, size_t size, uint8_t* dst) -{ - auto const result = arrow_file->ReadAt(offset, size, dst); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return result.ValueOrDie(); -} - -[[nodiscard]] size_t arrow_io_source::size() const -{ - auto const result = arrow_file->GetSize(); - CUDF_EXPECTS(result.ok(), "Cannot get file size"); - return result.ValueOrDie(); -} - -} // namespace cudf::io diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 006b36add0e..ac77a362e1c 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -321,7 +321,6 @@ ConfigureTest( ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp) -ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) @@ -334,9 +333,6 @@ target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) -if(CUDF_ENABLE_ARROW_S3) - target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") -endif() # ################################################################################################## # * sort tests ------------------------------------------------------------------------------------ diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp deleted file mode 100644 index ffdf2c7e00f..00000000000 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -// Global environment for temporary files -auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); - -// Base test fixture for tests -struct ArrowIOTest : public cudf::test::BaseFixture {}; - -TEST_F(ArrowIOTest, URIFileSystem) -{ - const std::string file_name = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; - std::ofstream outfile(file_name, std::ofstream::out); - outfile << "{\"a\":11, \"b\":1.1}\n{\"a\":22, \"b\":2.2}"; - outfile.close(); - - std::string file_uri = "file://" + file_name; - auto datasource = std::make_unique(file_uri); - - // Populate the JSON Reader Options - cudf::io::json_reader_options options = - cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true); - - // Read the JSON file from the LocalFileSystem - cudf::io::table_with_metadata tbl = cudf::io::read_json(options); - - ASSERT_EQ(2, tbl.tbl->num_columns()); - ASSERT_EQ(2, tbl.tbl->num_rows()); -} - -TEST_F(ArrowIOTest, S3FileSystem) -{ - std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; - - // Check to see if Arrow was built with support for S3. If not, ensure this - // test throws. If so, validate the S3 file contents. - auto const s3_unsupported = arrow::fs::FileSystemFromUri(s3_uri).status().IsNotImplemented(); - if (s3_unsupported) { - EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); - } else { - auto datasource = std::make_unique(s3_uri); - - // Populate the Parquet Reader Options - cudf::io::source_info src(datasource.get()); - std::vector single_column; - single_column.insert(single_column.begin(), "total_bill"); - cudf::io::parquet_reader_options_builder builder(src); - cudf::io::parquet_reader_options options = builder.columns(single_column).build(); - - // Read the Parquet file from S3 - cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options); - - ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options - ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file - } - -#ifdef ARROW_S3 - if (!s3_unsupported) { - // Verify that we are using Arrow with S3, and call finalize - // https://github.com/apache/arrow/issues/36974 - // This needs to be in a separate conditional to ensure we call - // finalize after all arrow_io_source instances have been deleted. 
- [[maybe_unused]] auto _ = arrow::fs::EnsureS3Finalized(); - } -#endif -} - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index ff433264446..dc14824d834 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -25,8 +25,8 @@ #include #include -#include #include +#include #include #include #include @@ -1197,30 +1197,6 @@ TEST_F(CsvReaderTest, HeaderOnlyFile) EXPECT_EQ(3, view.num_columns()); } -TEST_F(CsvReaderTest, ArrowFileSource) -{ - auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv"; - { - std::ofstream outfile(filepath, std::ofstream::out); - outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n"; - } - - std::shared_ptr infile; - ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok()); - - auto arrow_source = cudf::io::arrow_io_source{infile}; - cudf::io::csv_reader_options in_opts = - cudf::io::csv_reader_options::builder(cudf::io::source_info{&arrow_source}) - .dtypes({dtype()}); - auto result = cudf::io::read_csv(in_opts); - - auto const view = result.tbl->view(); - EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(type_id::INT8, view.column(0).type().id()); - - expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); -} - TEST_F(CsvReaderTest, InvalidFloatingPoint) { auto const filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv"; diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 0a485e26b71..576a698ba31 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -958,31 +957,6 @@ TEST_F(JsonReaderTest, NoDataFileValues) EXPECT_EQ(0, view.num_columns()); } -TEST_F(JsonReaderTest, ArrowFileSource) -{ - const std::string fname = temp_env->get_temp_dir() + "ArrowFileSource.csv"; - - std::ofstream outfile(fname, std::ofstream::out); - outfile << "[9]\n[8]\n[7]\n[6]\n[5]\n[4]\n[3]\n[2]\n"; - outfile.close(); - - std::shared_ptr infile; - ASSERT_TRUE(arrow::io::ReadableFile::Open(fname).Value(&infile).ok()); - - auto arrow_source = cudf::io::arrow_io_source{infile}; - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{&arrow_source}) - .dtypes({dtype()}) - .lines(true); - - cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - - EXPECT_EQ(result.tbl->num_columns(), 1); - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int8_wrapper{{9, 8, 7, 6, 5, 4, 3, 2}}); -} - TEST_P(JsonReaderParamTest, InvalidFloatingPoint) { auto const test_opt = GetParam(); diff --git a/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd b/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd deleted file mode 100644 index 54a913a9ce3..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- -cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource -from libcpp.memory cimport shared_ptr -from libcpp.string cimport string -from pyarrow.includes.libarrow cimport CRandomAccessFile - - -cdef extern from "cudf/io/arrow_io_source.hpp" \ - namespace "cudf::io" nogil: - - cdef cppclass arrow_io_source(cudf_io_datasource.datasource): - arrow_io_source(const string& arrow_uri) except + - arrow_io_source(shared_ptr[CRandomAccessFile]) except + From 3ac409dc26437deb77d30f64ec148121394878e4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 19 Aug 2024 21:18:11 -0700 Subject: [PATCH 093/270] Fix C++ and Cython io types (#16610) The C++ I/O types were previously not specifying a base type despite the fact that the Cython code was relying on the base being an int32. This has apparently never bitten us before, but in theory this could go very wrong since it leaves the underlying type up to the compiler and if the C++ binary used something other than an int32 that would result in an ABI incompatibility with the Python build that would produce spurious results. While fixing this, I also noticed that the Cython contained a number of erroneous (likely outdated) declarations. Since Cython extern declarations are simply an indicate to Cython of how to resolve a function call _if_ it appears in compiled Cython code, these were not causing any build failures because these were all unused APIs, so I removed them from the Cython with no further changes needed. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16610 --- cpp/include/cudf/io/types.hpp | 12 ++--- .../pylibcudf/pylibcudf/libcudf/io/types.pxd | 50 ++++++++----------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 3df737413fa..a34881942ce 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -54,7 +54,7 @@ namespace io { /** * @brief Compression algorithms */ -enum class compression_type { +enum class compression_type : int32_t { NONE, ///< No compression AUTO, ///< Automatically detect or select compression format SNAPPY, ///< Snappy format, using byte-oriented LZ77 @@ -72,7 +72,7 @@ enum class compression_type { /** * @brief Data source or destination types */ -enum class io_type { +enum class io_type : int32_t { FILEPATH, ///< Input/output is a file path HOST_BUFFER, ///< Input/output is a buffer in host memory DEVICE_BUFFER, ///< Input/output is a buffer in device memory @@ -83,7 +83,7 @@ enum class io_type { /** * @brief Behavior when handling quotations in field data */ -enum class quote_style { +enum class quote_style : int32_t { MINIMAL, ///< Quote only fields which contain special characters ALL, ///< Quote all fields NONNUMERIC, ///< Quote all non-numeric fields @@ -93,7 +93,7 @@ enum class quote_style { /** * @brief Column statistics granularity type for parquet/orc writers */ -enum statistics_freq { +enum statistics_freq : int32_t { STATISTICS_NONE = 0, ///< No column statistics STATISTICS_ROWGROUP = 1, ///< Per-Rowgroup column statistics STATISTICS_PAGE = 2, ///< Per-page column statistics @@ -103,7 +103,7 @@ enum statistics_freq { /** * @brief Valid encodings for use with `column_in_metadata::set_encoding()` */ -enum class column_encoding { +enum class column_encoding : int32_t { // Common encodings: 
---
 cpp/include/cudf/io/types.hpp                | 12 ++---
 .../pylibcudf/pylibcudf/libcudf/io/types.pxd | 50 ++++++++-----------
 2 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 3df737413fa..a34881942ce 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -54,7 +54,7 @@ namespace io {
 /**
  * @brief Compression algorithms
  */
-enum class compression_type {
+enum class compression_type : int32_t {
   NONE,    ///< No compression
   AUTO,    ///< Automatically detect or select compression format
   SNAPPY,  ///< Snappy format, using byte-oriented LZ77
@@ -72,7 +72,7 @@ enum class compression_type {
 /**
  * @brief Data source or destination types
  */
-enum class io_type {
+enum class io_type : int32_t {
   FILEPATH,       ///< Input/output is a file path
   HOST_BUFFER,    ///< Input/output is a buffer in host memory
   DEVICE_BUFFER,  ///< Input/output is a buffer in device memory
@@ -83,7 +83,7 @@
 /**
  * @brief Behavior when handling quotations in field data
 */
-enum class quote_style {
+enum class quote_style : int32_t {
   MINIMAL,     ///< Quote only fields which contain special characters
   ALL,         ///< Quote all fields
   NONNUMERIC,  ///< Quote all non-numeric fields
@@ -93,7 +93,7 @@
 /**
  * @brief Column statistics granularity type for parquet/orc writers
 */
-enum statistics_freq {
+enum statistics_freq : int32_t {
   STATISTICS_NONE     = 0,  ///< No column statistics
   STATISTICS_ROWGROUP = 1,  ///< Per-Rowgroup column statistics
   STATISTICS_PAGE     = 2,  ///< Per-page column statistics
@@ -103,7 +103,7 @@
 /**
  * @brief Valid encodings for use with `column_in_metadata::set_encoding()`
 */
-enum class column_encoding {
+enum class column_encoding : int32_t {
   // Common encodings:
   USE_DEFAULT = -1,  ///< No encoding has been requested, use default encoding
   DICTIONARY,        ///< Use dictionary encoding
@@ -222,7 +222,7 @@ class writer_compression_statistics {
 /**
  * @brief Control use of dictionary encoding for parquet writer
 */
-enum dictionary_policy {
+enum dictionary_policy : int32_t {
   NEVER    = 0,  ///< Never use dictionary encoding
   ADAPTIVE = 1,  ///< Use dictionary when it will not impact compression
   ALWAYS   = 2   ///< Use dictionary regardless of impact on compression
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
index a3d99807876..5f3be2f0727 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
@@ -6,12 +6,10 @@ cimport pylibcudf.libcudf.table.table_view as cudf_table_view
 from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
-from libcpp.memory cimport shared_ptr, unique_ptr
-from libcpp.pair cimport pair
+from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from pyarrow.includes.libarrow cimport CRandomAccessFile
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
@@ -42,32 +40,32 @@ cdef extern from "cudf/io/types.hpp" \
     cpdef enum class io_type(int32_t):
         FILEPATH
         HOST_BUFFER
+        DEVICE_BUFFER
         VOID
         USER_IMPLEMENTED

     cpdef enum class statistics_freq(int32_t):
-        STATISTICS_NONE = 0,
-        STATISTICS_ROWGROUP = 1,
-        STATISTICS_PAGE = 2,
-        STATISTICS_COLUMN = 3,
+        STATISTICS_NONE,
+        STATISTICS_ROWGROUP,
+        STATISTICS_PAGE,
+        STATISTICS_COLUMN,

     cpdef enum class dictionary_policy(int32_t):
-        NEVER = 0,
-        ADAPTIVE = 1,
-        ALWAYS = 2,
-
-    cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
-        cpdef enum class column_encoding(int32_t):
-            USE_DEFAULT = -1
-            DICTIONARY = 0
-            PLAIN = 1
-            DELTA_BINARY_PACKED = 2
-            DELTA_LENGTH_BYTE_ARRAY =3
-            DELTA_BYTE_ARRAY = 4
-            BYTE_STREAM_SPLIT = 5
-            DIRECT = 6
-            DIRECT_V2 = 7
-            DICTIONARY_V2 = 8
+        NEVER,
+        ADAPTIVE,
+        ALWAYS,
+
+    cpdef enum class column_encoding(int32_t):
+        USE_DEFAULT
+        DICTIONARY
+        PLAIN
+        DELTA_BINARY_PACKED
+        DELTA_LENGTH_BYTE_ARRAY
+        DELTA_BYTE_ARRAY
+        BYTE_STREAM_SPLIT
+        DIRECT
+        DIRECT_V2
+        DICTIONARY_V2

     cdef cppclass column_name_info:
         string name
@@ -76,7 +74,6 @@ cdef extern from "cudf/io/types.hpp" \
     cdef cppclass table_metadata:
         table_metadata() except +

-        vector[string] column_names
        map[string, string] user_data
        vector[unordered_map[string, string]] per_file_user_data
        vector[column_name_info] schema_info
@@ -120,10 +117,7 @@ cdef extern from "cudf/io/types.hpp" \
         host_buffer(const char* data, size_t size)

     cdef cppclass source_info:
-        io_type type
        const vector[string]& filepaths() except +
-        const vector[host_buffer]& buffers() except +
-        vector[shared_ptr[CRandomAccessFile]] files

        source_info() except +
        source_info(const vector[string] &filepaths) except +
@@ -132,9 +126,7 @@ cdef extern from "cudf/io/types.hpp" \
         source_info(const vector[cudf_io_datasource.datasource*] &datasources) except +

     cdef cppclass sink_info:
-        io_type type
        const vector[string]& filepaths()
-        const vector[vector[char] *]& buffers()
        const vector[cudf_io_data_sink.data_sink *]& user_sinks()

        sink_info() except +

From 2f7d35435db2b5ed9ead96cf43e2a710db5e5e6d Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 20 Aug 2024 03:52:34 -0500
Subject: [PATCH 094/270] bug-fix: cudf/io/json.hpp use after move (#16609)

This PR fixes a use after
move in json header. The fix simply shifts the attributes to access the object value before moving it. Closes https://github.com/rapidsai/cudf/issues/16608 Authors: - Nicolas (https://github.com/NicolasDenoyelle) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16609 --- cpp/include/cudf/io/json.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 0cb39d15cd5..fde1857cb7f 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -696,6 +696,8 @@ class json_writer_options_builder; class json_writer_options { // Specify the sink to use for writer output sink_info _sink; + // maximum number of rows to write in each chunk (limits memory use) + size_type _rows_per_chunk = std::numeric_limits::max(); // Set of columns to output table_view _table; // string to use for null entries @@ -704,8 +706,6 @@ class json_writer_options { bool _include_nulls = false; // Indicates whether to use JSON lines for records format bool _lines = false; - // maximum number of rows to write in each chunk (limits memory use) - size_type _rows_per_chunk = std::numeric_limits::max(); // string to use for values != 0 in INT8 types (default 'true') std::string _true_value = std::string{"true"}; // string to use for values == 0 in INT8 types (default 'false') @@ -720,7 +720,7 @@ class json_writer_options { * @param table Table to be written to output */ explicit json_writer_options(sink_info sink, table_view table) - : _sink(std::move(sink)), _table(std::move(table)), _rows_per_chunk(table.num_rows()) + : _sink(std::move(sink)), _rows_per_chunk(table.num_rows()), _table(std::move(table)) { } From 1cccf3eeaee50cd69107b3c54ee349720233d8c6 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 20 Aug 2024 16:32:29 +0200 Subject: [PATCH 095/270] Replace usages of `thrust::optional` with `std::optional` (#15091) We want to get rid of thrust types in API boundaries so replace them by the better suited std types Authors: - Michael Schellenberger Costa (https://github.com/miscco) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15091 --- .../cudf/column/column_device_view.cuh | 28 ++++----- cpp/include/cudf/detail/copy_if_else.cuh | 6 +- cpp/include/cudf/detail/indexalator.cuh | 12 ++-- cpp/include/cudf/detail/iterator.cuh | 26 ++++---- cpp/include/cudf/json/json.hpp | 2 - .../strings/detail/convert/fixed_point.cuh | 8 +-- .../cudf/strings/detail/copy_if_else.cuh | 6 +- .../cudf/table/experimental/row_operators.cuh | 6 +- cpp/src/binaryop/binaryop.cpp | 4 +- cpp/src/io/orc/orc.hpp | 7 ++- cpp/src/io/orc/writer_impl.cu | 6 +- .../io/parquet/compact_protocol_reader.cpp | 8 +-- cpp/src/io/parquet/parquet.hpp | 62 +++++++++---------- cpp/src/io/parquet/parquet_gpu.hpp | 14 ++--- cpp/src/io/parquet/predicate_pushdown.cpp | 6 +- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.cu | 6 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 6 +- cpp/src/io/parquet/writer_impl.cu | 8 +-- cpp/src/json/json_path.cu | 22 +++---- cpp/src/lists/contains.cu | 1 - cpp/src/lists/explode.cu | 14 ++--- 
cpp/src/strings/convert/convert_datetime.cu | 10 +-- cpp/src/strings/regex/regex.cuh | 4 +- cpp/src/strings/regex/regex.inl | 6 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/transform/row_bit_count.cu | 18 +++--- cpp/tests/io/parquet_common.cpp | 2 +- cpp/tests/io/parquet_common.hpp | 2 +- cpp/tests/iterator/indexalator_test.cu | 11 ++-- cpp/tests/iterator/offsetalator_test.cu | 3 +- cpp/tests/iterator/optional_iterator_test.cuh | 25 ++++---- .../optional_iterator_test_numeric.cu | 10 +-- 33 files changed, 176 insertions(+), 177 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 89fe59bfeaa..c3238cb94fd 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -32,9 +32,9 @@ #include +#include #include #include -#include #include #include @@ -614,7 +614,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { /** * @brief Return an optional iterator to the first element of the column. * - * Dereferencing the returned iterator returns a `thrust::optional`. + * Dereferencing the returned iterator returns a `cuda::std::optional`. * * The element of this iterator contextually converts to bool. The conversion returns true * if the object contains a value and false if it does not contain a value. @@ -739,7 +739,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { /** * @brief Return an optional iterator to the element following the last element of the column. * - * The returned iterator represents a `thrust::optional` element. + * The returned iterator represents a `cuda::std::optional` element. * * This function does not participate in overload resolution if * `column_device_view::has_element_accessor()` is false. @@ -1272,21 +1272,21 @@ struct value_accessor { * @brief optional accessor of a column * * - * The optional_accessor always returns a `thrust::optional` of `column[i]`. The validity + * The optional_accessor always returns a `cuda::std::optional` of `column[i]`. The validity * of the optional is determined by the `Nullate` parameter which may be one of the following: * * - `nullate::YES` means that the column supports nulls and the optional returned * might be valid or invalid. * * - `nullate::NO` means the caller attests that the column has no null values, - * no checks will occur and `thrust::optional{column[i]}` will be + * no checks will occur and `cuda::std::optional{column[i]}` will be * return for each `i`. * * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller * specifies if the column has nulls at runtime. - * For `DYNAMIC{true}` the return value will be `thrust::optional{column[i]}` if - * element `i` is not null and `thrust::optional{}` if element `i` is null. - * For `DYNAMIC{false}` the return value will always be `thrust::optional{column[i]}`. + * For `DYNAMIC{true}` the return value will be `cuda::std::optional{column[i]}` if + * element `i` is not null and `cuda::std::optional{}` if element `i` is null. + * For `DYNAMIC{false}` the return value will always be `cuda::std::optional{column[i]}`. * * @throws cudf::logic_error if column datatype and template T type mismatch. * @throws cudf::logic_error if the column is not nullable and `with_nulls` evaluates to true @@ -1312,19 +1312,19 @@ struct optional_accessor { } /** - * @brief Returns a `thrust::optional` of `column[i]`. + * @brief Returns a `cuda::std::optional` of `column[i]`. 
* * @param i The index of the element to return - * @return A `thrust::optional` that contains the value of `column[i]` is not null. If that + * @return A `cuda::std::optional` that contains the value of `column[i]` is not null. If that * element is null, the resulting optional will not contain a value. */ - __device__ inline thrust::optional operator()(cudf::size_type i) const + __device__ inline cuda::std::optional operator()(cudf::size_type i) const { if (has_nulls) { - return (col.is_valid_nocheck(i)) ? thrust::optional{col.element(i)} - : thrust::optional{thrust::nullopt}; + return (col.is_valid_nocheck(i)) ? cuda::std::optional{col.element(i)} + : cuda::std::optional{cuda::std::nullopt}; } - return thrust::optional{col.element(i)}; + return cuda::std::optional{col.element(i)}; } Nullate has_nulls{}; ///< Indicates if the `col` should be checked for nulls. diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 8418e279ce7..d260a4591b7 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -25,8 +25,8 @@ #include #include +#include #include -#include namespace cudf { namespace detail { @@ -70,7 +70,7 @@ __launch_bounds__(block_size) CUDF_KERNEL while (warp_cur <= warp_end) { auto const index = static_cast(tidx); auto const opt_value = - (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt; + (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : cuda::std::nullopt; if (opt_value) { out.element(index) = static_cast(*opt_value); } // update validity @@ -156,7 +156,7 @@ std::unique_ptr copy_if_else(bool nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // This is the type of the thrust::optional element in the passed iterators + // This is the type of the cuda::std::optional element in the passed iterators using Element = typename thrust::iterator_traits::value_type::value_type; size_type size = std::distance(lhs_begin, lhs_end); diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index b5d57da6cd5..ec7b1c3e6b6 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -22,9 +22,9 @@ #include #include +#include #include #include -#include #include namespace cudf { @@ -376,10 +376,10 @@ struct indexalator_factory { iter = make_input_iterator(col); } - __device__ thrust::optional operator()(size_type i) const + __device__ cuda::std::optional operator()(size_type i) const { - return has_nulls && !bit_is_set(null_mask, i + offset) ? thrust::nullopt - : thrust::make_optional(iter[i]); + return has_nulls && !bit_is_set(null_mask, i + offset) ? cuda::std::nullopt + : cuda::std::make_optional(iter[i]); } }; @@ -400,9 +400,9 @@ struct indexalator_factory { iter = indexalator_factory::make_input_iterator(input); } - __device__ thrust::optional operator()(size_type) const + __device__ cuda::std::optional operator()(size_type) const { - return is_null ? thrust::nullopt : thrust::make_optional(*iter); + return is_null ? cuda::std::nullopt : cuda::std::make_optional(*iter); } }; diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 9e6227ec19b..4349e1b70fd 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,10 +37,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -186,7 +186,7 @@ auto make_null_replacement_iterator(column_device_view const& column, /** * @brief Constructs an optional iterator over a column's values and its validity. * - * Dereferencing the returned iterator returns a `thrust::optional`. + * Dereferencing the returned iterator returns a `cuda::std::optional`. * * The element of this iterator contextually converts to bool. The conversion returns true * if the object contains a value and false if it does not contain a value. @@ -237,7 +237,7 @@ auto make_null_replacement_iterator(column_device_view const& column, * @param column The column to iterate * @param has_nulls Indicates whether `column` is checked for nulls. * @return Iterator that returns valid column elements and the validity of the - * element in a `thrust::optional` + * element in a `cuda::std::optional` */ template auto make_optional_iterator(column_device_view const& column, Nullate has_nulls) @@ -393,7 +393,7 @@ auto inline make_scalar_iterator(scalar const& scalar_value) /** * @brief Optional accessor for a scalar * - * The `scalar_optional_accessor` always returns a `thrust::optional` of the scalar. + * The `scalar_optional_accessor` always returns a `cuda::std::optional` of the scalar. * The validity of the optional is determined by the `Nullate` parameter which may * be one of the following: * @@ -401,14 +401,14 @@ auto inline make_scalar_iterator(scalar const& scalar_value) * will contain a value only if the scalar is valid. * * - `nullate::NO` means the caller attests that the scalar will always be valid, - * no checks will occur and `thrust::optional{column[i]}` will return a value + * no checks will occur and `cuda::std::optional{column[i]}` will return a value * for each `i`. * * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller * specifies if the scalar may be valid or invalid. - * For `DYNAMIC{true}` the return value will be a `thrust::optional{scalar}` when the - * scalar is valid and a `thrust::optional{}` when the scalar is invalid. - * For `DYNAMIC{false}` the return value will always be a `thrust::optional{scalar}`. + * For `DYNAMIC{true}` the return value will be a `cuda::std::optional{scalar}` when the + * scalar is valid and a `cuda::std::optional{}` when the scalar is invalid. + * For `DYNAMIC{false}` the return value will always be a `cuda::std::optional{scalar}`. * * @throws `cudf::logic_error` if scalar datatype and Element type mismatch. 
* @@ -418,7 +418,7 @@ auto inline make_scalar_iterator(scalar const& scalar_value) template struct scalar_optional_accessor : public scalar_value_accessor { using super_t = scalar_value_accessor; - using value_type = thrust::optional; + using value_type = cuda::std::optional; scalar_optional_accessor(scalar const& scalar_value, Nullate with_nulls) : scalar_value_accessor(scalar_value), has_nulls{with_nulls} @@ -427,7 +427,7 @@ struct scalar_optional_accessor : public scalar_value_accessor { __device__ inline value_type const operator()(size_type) const { - if (has_nulls && !super_t::dscalar.is_valid()) { return value_type{thrust::nullopt}; } + if (has_nulls && !super_t::dscalar.is_valid()) { return value_type{cuda::std::nullopt}; } if constexpr (cudf::is_fixed_point()) { using namespace numeric; @@ -519,7 +519,7 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor`. + * Dereferencing the returned iterator returns a `cuda::std::optional`. * * The element of this iterator contextually converts to bool. The conversion returns true * if the object contains a value and false if it does not contain a value. @@ -575,7 +575,7 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor auto inline make_optional_iterator(scalar const& scalar_value, Nullate has_nulls) diff --git a/cpp/include/cudf/json/json.hpp b/cpp/include/cudf/json/json.hpp index 48d5dcf7727..403374c536d 100644 --- a/cpp/include/cudf/json/json.hpp +++ b/cpp/include/cudf/json/json.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace CUDF_EXPORT cudf { /** diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 5f51da967d3..8440805960e 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -17,8 +17,8 @@ #include +#include #include -#include #include namespace cudf { @@ -88,7 +88,7 @@ __device__ inline thrust::pair parse_integer( * @return Integer value of the exponent */ template -__device__ thrust::optional parse_exponent(char const* iter, char const* iter_end) +__device__ cuda::std::optional parse_exponent(char const* iter, char const* iter_end) { constexpr uint32_t exponent_max = static_cast(std::numeric_limits::max()); @@ -105,12 +105,12 @@ __device__ thrust::optional parse_exponent(char const* iter, char const while (iter < iter_end) { auto const ch = *iter++; if (ch < '0' || ch > '9') { - if (check_only) { return thrust::nullopt; } + if (check_only) { return cuda::std::nullopt; } break; } uint32_t exp_check = static_cast(exp_ten * 10) + static_cast(ch - '0'); - if (check_only && (exp_check > exponent_max)) { return thrust::nullopt; } // check overflow + if (check_only && (exp_check > exponent_max)) { return cuda::std::nullopt; } // check overflow exp_ten = static_cast(exp_check); } diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 4db7651330b..213a41ca596 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -25,8 +25,8 @@ #include #include +#include #include -#include #include namespace cudf { @@ -41,9 +41,9 @@ namespace detail { * ``` * * @tparam StringIterLeft A random access iterator whose value_type is - * `thrust::optional` where the `optional` has a value iff the element is valid. + * `cuda::std::optional` where the `optional` has a value iff the element is valid. 
* @tparam StringIterRight A random access iterator whose value_type is - * `thrust::optional` where the `optional` has a value iff the element is valid. + * `cuda::std::optional` where the `optional` has a value iff the element is valid. * @tparam Filter Functor that takes an index and returns a boolean. * * @param lhs_begin Start of first set of data. Used when `filter_fn` returns true. diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index f05e5f4ca5c..3f33c70c29a 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -211,7 +211,7 @@ struct sorting_physical_element_comparator { } }; -using optional_dremel_view = thrust::optional; +using optional_dremel_view = cuda::std::optional; // The has_nested_columns template parameter of the device_row_comparator is // necessary to help the compiler optimize our code. Without it, the list and @@ -223,12 +223,12 @@ using optional_dremel_view = thrust::optional; // std::optional> in the // preprocessed_table/device_row_comparator (which is always valid when // has_nested_columns and is otherwise invalid) that is then unpacked to a -// thrust::optional at the element_comparator level (which +// cuda::std::optional at the element_comparator level (which // is always valid for a list column and otherwise invalid). We cannot use an // additional template parameter for the element_comparator on a per-column // basis because we cannot conditionally define dremel_device_view member // variables without jumping through extra hoops with inheritance, so the -// thrust::optional member must be an optional rather than +// cuda::std::optional member must be an optional rather than // a raw dremel_device_view. /** * @brief Computes the lexicographic comparison between 2 rows. diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 3ac8547baad..25b0f68aaa8 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -41,7 +41,7 @@ #include #include -#include +#include #include @@ -173,7 +173,7 @@ template void fixed_point_binary_operation_validation(binary_operator op, Lhs lhs, Rhs rhs, - thrust::optional output_type = {}) + cuda::std::optional output_type = {}) { CUDF_EXPECTS((is_fixed_point(lhs) or is_fixed_point(rhs)), "One of the inputs must have fixed_point data_type."); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index e1403acd455..790532c9d54 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include @@ -692,11 +692,12 @@ class metadata { * @brief `column_device_view` and additional, ORC specific, information on the column. 
*/ struct orc_column_device_view : public column_device_view { - __device__ orc_column_device_view(column_device_view col, thrust::optional parent_idx) + __device__ orc_column_device_view(column_device_view col, + cuda::std::optional parent_idx) : column_device_view{col}, parent_index{parent_idx} { } - thrust::optional parent_index; + cuda::std::optional parent_index; bitmask_type const* pushdown_mask = nullptr; }; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index f3b8cfbc836..04eee68e757 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -1831,7 +1831,7 @@ orc_table_view make_orc_table_view(table_view const& table, type_kinds, stream, rmm::mr::get_current_device_resource()); rmm::device_uvector d_orc_columns(orc_columns.size(), stream); - using stack_value_type = thrust::pair>; + using stack_value_type = thrust::pair>; rmm::device_uvector stack_storage(orc_columns.size(), stream); // pre-order append ORC device columns @@ -1847,7 +1847,7 @@ orc_table_view make_orc_table_view(table_view const& table, thrust::make_reverse_iterator(d_table.end()), thrust::make_reverse_iterator(d_table.begin()), [&stack](column_device_view const& c) { - stack.push({&c, thrust::nullopt}); + stack.push({&c, cuda::std::nullopt}); }); uint32_t idx = 0; diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index e13ed5e85e5..afcf6b373a9 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -304,10 +304,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - thrust::optional& val; // union structs are always wrapped in std::optional + cuda::std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, thrust::optional& v) + parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -431,10 +431,10 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - thrust::optional& val; + cuda::std::optional& val; public: - parquet_field_optional(int f, thrust::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 8ee4c175e09..5d10472b0ae 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,7 +20,7 @@ #include -#include +#include #include #include @@ -94,10 +94,10 @@ struct LogicalType { BSON }; Type type; - thrust::optional decimal_type; - thrust::optional time_type; - thrust::optional timestamp_type; - thrust::optional int_type; + cuda::std::optional decimal_type; + cuda::std::optional time_type; + cuda::std::optional timestamp_type; + cuda::std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +178,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - thrust::optional 
converted_type; + cuda::std::optional converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original schema - thrust::optional field_id; + cuda::std::optional field_id; // 10: replaces converted type - thrust::optional logical_type; + cuda::std::optional logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - thrust::optional arrow_type; + cuda::std::optional arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -259,21 +259,21 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - thrust::optional> max; + cuda::std::optional> max; // deprecated min value in signed comparison order - thrust::optional> min; + cuda::std::optional> min; // count of null values in the column - thrust::optional null_count; + cuda::std::optional null_count; // count of distinct values occurring - thrust::optional distinct_count; + cuda::std::optional distinct_count; // max value for column determined by ColumnOrder - thrust::optional> max_value; + cuda::std::optional> max_value; // min value for column determined by ColumnOrder - thrust::optional> min_value; + cuda::std::optional> min_value; // If true, max_value is the actual maximum value for a column - thrust::optional is_max_value_exact; + cuda::std::optional is_max_value_exact; // If true, min_value is the actual minimum value for a column - thrust::optional is_min_value_exact; + cuda::std::optional is_min_value_exact; }; /** @@ -282,7 +282,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - thrust::optional unencoded_byte_array_data_bytes; + cuda::std::optional unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element @@ -291,14 +291,14 @@ struct SizeStatistics { * * This value should not be written if max_repetition_level is 0. */ - thrust::optional> repetition_level_histogram; + cuda::std::optional> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1. */ - thrust::optional> definition_level_histogram; + cuda::std::optional> definition_level_histogram; }; /** @@ -319,7 +319,7 @@ struct OffsetIndex { std::vector page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. 
- thrust::optional> unencoded_byte_array_data_bytes; + cuda::std::optional> unencoded_byte_array_data_bytes; }; /** @@ -331,10 +331,10 @@ struct ColumnIndex { std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - thrust::optional> null_counts; // Optional count of null values per page + cuda::std::optional> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - thrust::optional> repetition_level_histogram; - thrust::optional> definition_level_histogram; + cuda::std::optional> repetition_level_histogram; + cuda::std::optional> definition_level_histogram; }; /** @@ -384,11 +384,11 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. - thrust::optional> encoding_stats; + cuda::std::optional> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - thrust::optional size_statistics; + cuda::std::optional size_statistics; }; /** @@ -430,13 +430,13 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns. - thrust::optional> sorting_columns; + cuda::std::optional> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - thrust::optional file_offset; + cuda::std::optional file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - thrust::optional total_compressed_size; + cuda::std::optional total_compressed_size; // Row group ordinal in the file - thrust::optional ordinal; + cuda::std::optional ordinal; }; /** @@ -461,7 +461,7 @@ struct FileMetaData { std::vector row_groups; std::vector key_value_metadata; std::string created_by = ""; - thrust::optional> column_orders; + cuda::std::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index efc1f5ebab1..8f52f073833 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -394,7 +394,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - thrust::optional logical_type_, + cuda::std::optional logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -438,12 +438,12 @@ struct ColumnChunkDesc { int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum - thrust::optional logical_type{}; // logical type + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** 
column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum + cuda::std::optional logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 481c1e9fcdd..5ca090b05b3 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -154,7 +154,7 @@ struct stats_caster { } void set_index(size_type index, - thrust::optional> const& binary_value, + cuda::std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { @@ -236,8 +236,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, thrust::nullopt, {}); - max.set_index(stats_idx, thrust::nullopt, {}); + min.set_index(stats_idx, cuda::std::nullopt, {}); + max.set_index(stats_idx, cuda::std::nullopt, {}); } stats_idx++; } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 68ec61ead0a..2648a1f41ab 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -39,7 +39,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(thrust::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 794750ab6d2..54ba898b058 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -370,11 +370,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - thrust::optional logical_type) + cuda::std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? 
to_clockrate(timestamp_type_id) : 0; @@ -385,7 +385,7 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, thrust::nullopt); + return std::make_tuple(clock_rate, cuda::std::nullopt); } } diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 581c44d024b..00f75e4e828 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -thrust::optional converted_to_logical_type(SchemaElement const& schema) +cuda::std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ thrust::optional converted_to_logical_type(SchemaElement const& sch default: return LogicalType{LogicalType::UNDEFINED}; } } - return thrust::nullopt; + return cuda::std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = thrust::nullopt; + struct_elem.converted_type = cuda::std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 36a1d8377bf..c2c5dbb4a56 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -185,7 +185,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - thrust::optional> column_orders = thrust::nullopt; + cuda::std::optional> column_orders = cuda::std::nullopt; }; namespace { @@ -471,7 +471,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = thrust::nullopt; + col_schema.converted_type = cuda::std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -749,7 +749,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = thrust::nullopt; + col_schema.converted_type = cuda::std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -2776,7 +2776,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = thrust::nullopt; + se.logical_type = cuda::std::nullopt; } } diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index d1a1097de35..1bf4bf3b153 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include #include @@ -207,7 +207,7 @@ class parser { struct json_output { size_t output_max_len; char* output; - thrust::optional output_len; + cuda::std::optional output_len; __device__ void add_output(char const* str, size_t len) { @@ -656,7 +656,7 @@ class path_state : private parser { * @param stream Cuda stream to perform any gpu actions on * @returns A pair containing the command buffer, and maximum stack depth required. */ -std::pair>, int> build_command_buffer( +std::pair>, int> build_command_buffer( cudf::string_scalar const& json_path, rmm::cuda_stream_view stream) { std::string h_json_path = json_path.to_string(stream); @@ -690,8 +690,8 @@ std::pair>, int> build_comma } while (op.type != path_operator_type::END); auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; - return is_empty ? std::pair(thrust::nullopt, 0) - : std::pair(thrust::make_optional(cudf::detail::make_device_uvector_sync( + return is_empty ? std::pair(cuda::std::nullopt, 0) + : std::pair(cuda::std::make_optional(cudf::detail::make_device_uvector_sync( h_operators, stream, rmm::mr::get_current_device_resource())), max_stack_depth); } @@ -920,9 +920,9 @@ __launch_bounds__(block_size) CUDF_KERNEL path_operator const* const commands, size_type* d_sizes, cudf::detail::input_offsetalator output_offsets, - thrust::optional out_buf, - thrust::optional out_validity, - thrust::optional out_valid_count, + cuda::std::optional out_buf, + cuda::std::optional out_validity, + cuda::std::optional out_valid_count, get_json_object_options options) { auto tid = cudf::detail::grid_1d::global_thread_id(); @@ -1012,9 +1012,9 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c std::get<0>(preprocess).value().data(), sizes.data(), d_offsets, - thrust::nullopt, - thrust::nullopt, - thrust::nullopt, + cuda::std::nullopt, + cuda::std::nullopt, + cuda::std::nullopt, options); // convert sizes to offsets diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 30c03a8cd68..11703527d26 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 46c4fc78a6f..74a0d842aad 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include #include -#include #include #include @@ -57,8 +57,8 @@ std::unique_ptr
build_table( size_type const explode_column_idx, column_view const& sliced_child, cudf::device_span gather_map, - thrust::optional> explode_col_gather_map, - thrust::optional> position_array, + cuda::std::optional> explode_col_gather_map, + cuda::std::optional> position_array, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -143,8 +143,8 @@ std::unique_ptr
explode(table_view const& input_table, explode_column_idx, sliced_child, gather_map, - thrust::nullopt, - thrust::nullopt, + cuda::std::nullopt, + cuda::std::nullopt, stream, mr); } @@ -193,7 +193,7 @@ std::unique_ptr
explode_position(table_view const& input_table, explode_column_idx, sliced_child, gather_map, - thrust::nullopt, + cuda::std::nullopt, std::move(pos), stream, mr); @@ -292,7 +292,7 @@ std::unique_ptr
explode_outer(table_view const& input_table, sliced_child, gather_map, explode_col_gather_map, - include_position ? std::move(pos) : thrust::optional>{}, + include_position ? std::move(pos) : cuda::std::optional>{}, stream, mr); } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 64a2107e17a..99c40f00b00 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -36,11 +36,11 @@ #include #include +#include #include #include #include #include -#include #include #include @@ -519,7 +519,7 @@ struct check_datetime_format { * The checking here is a little more strict than the actual * parser used for conversion. */ - __device__ thrust::optional check_string(string_view const& d_string) + __device__ cuda::std::optional check_string(string_view const& d_string) { timestamp_components dateparts = {1970, 1, 1, 0}; // init to epoch time @@ -529,7 +529,7 @@ struct check_datetime_format { // eliminate static character values first if (item.item_type == format_char_type::literal) { // check static character matches - if (*ptr != item.value) return thrust::nullopt; + if (*ptr != item.value) return cuda::std::nullopt; ptr += item.length; length -= item.length; continue; @@ -645,7 +645,7 @@ struct check_datetime_format { case 'Z': result = true; // skip default: break; } - if (!result) return thrust::nullopt; + if (!result) return cuda::std::nullopt; ptr += bytes_read; length -= bytes_read; } @@ -821,7 +821,7 @@ struct datetime_formatter_fn { // We only dissect the timestamp into components if needed // by a specifier. And then we only do it once and reuse it. // This can improve performance when not using uncommon specifiers. - thrust::optional days; + cuda::std::optional days; auto days_from_timestamp = [tstamp]() { auto const count = tstamp.time_since_epoch().count(); diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index e6134296e45..2df404048f7 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -23,8 +23,8 @@ #include +#include #include -#include #include #include @@ -36,7 +36,7 @@ namespace detail { struct relist; using match_pair = thrust::pair; -using match_result = thrust::optional; +using match_result = cuda::std::optional; constexpr int32_t MAX_SHARED_MEM = 2048; ///< Memory size for storing prog instruction data constexpr std::size_t MAX_WORKING_MEM = 0x01'FFFF'FFFF; ///< Memory size for state data diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 23e1944cda4..3b899e4edc1 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -260,12 +260,12 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const switch (jnk.starttype) { case BOL: if (pos == 0) break; - if (jnk.startchar != '^') { return thrust::nullopt; } + if (jnk.startchar != '^') { return cuda::std::nullopt; } --itr; startchar = static_cast('\n'); case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); - if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; } + if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } itr = find_itr + (jnk.starttype == BOL); pos = itr.position(); break; @@ -396,7 +396,7 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const checkstart = jnk.list1->get_size() == 0; } while (!last_character && (!checkstart || !match)); - return match ? 
match_result({begin, end}) : thrust::nullopt; + return match ? match_result({begin, end}) : cuda::std::nullopt; } __device__ __forceinline__ match_result reprog_device::find(int32_t const thread_idx, diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 31234ea42ec..0ad3ab2305c 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -92,7 +92,7 @@ struct replace_multi_regex_fn { } reprog_device prog = progs[ptn_idx]; - auto const result = !prog.is_empty() ? prog.find(idx, d_str, itr) : thrust::nullopt; + auto const result = !prog.is_empty() ? prog.find(idx, d_str, itr) : cuda::std::nullopt; d_ranges[ptn_idx] = result ? found_range{result->first, result->second} : found_range{nchars, nchars}; } diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 4530fabf889..6a965d10184 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -35,8 +35,8 @@ #include #include +#include #include -#include #include namespace cudf { @@ -159,9 +159,9 @@ void flatten_hierarchy(ColIter begin, std::vector& info, hierarchy_info& h_info, rmm::cuda_stream_view stream, - size_type cur_depth = 0, - size_type cur_branch_depth = 0, - thrust::optional parent_index = {}); + size_type cur_depth = 0, + size_type cur_branch_depth = 0, + cuda::std::optional parent_index = {}); /** * @brief Type-dispatched functor called by flatten_hierarchy. @@ -177,7 +177,7 @@ struct flatten_functor { rmm::cuda_stream_view, size_type cur_depth, size_type cur_branch_depth, - thrust::optional) + cuda::std::optional) { out.push_back(col); info.push_back({cur_depth, cur_branch_depth, cur_branch_depth}); @@ -194,7 +194,7 @@ struct flatten_functor { rmm::cuda_stream_view, size_type cur_depth, size_type cur_branch_depth, - thrust::optional) + cuda::std::optional) { out.push_back(col); info.push_back({cur_depth, cur_branch_depth, cur_branch_depth}); @@ -210,7 +210,7 @@ struct flatten_functor { rmm::cuda_stream_view stream, size_type cur_depth, size_type cur_branch_depth, - thrust::optional parent_index) + cuda::std::optional parent_index) { // track branch depth as we reach this list and after we pass it auto const branch_depth_start = cur_branch_depth; @@ -243,7 +243,7 @@ struct flatten_functor { rmm::cuda_stream_view stream, size_type cur_depth, size_type cur_branch_depth, - thrust::optional) + cuda::std::optional) { out.push_back(col); info.push_back({cur_depth, cur_branch_depth, cur_branch_depth}); @@ -284,7 +284,7 @@ void flatten_hierarchy(ColIter begin, rmm::cuda_stream_view stream, size_type cur_depth, size_type cur_branch_depth, - thrust::optional parent_index) + cuda::std::optional parent_index) { std::for_each(begin, end, [&](column_view const& col) { cudf::type_dispatcher(col.type(), diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index c1211869bcc..3dd5ad145ea 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - thrust::optional const& ctype) + cuda::std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index 59ee85444f2..bc6145d77da 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -172,7 +172,7 @@ 
std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - thrust::optional const& ctype); + cuda::std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats); diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu index 0c10853ec02..dac2356dcb0 100644 --- a/cpp/tests/iterator/indexalator_test.cu +++ b/cpp/tests/iterator/indexalator_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,10 +20,10 @@ #include +#include #include #include #include -#include #include #include #include @@ -84,15 +84,16 @@ TYPED_TEST(IndexalatorTest, optional_iterator) auto d_col = cudf::test::fixed_width_column_wrapper( host_values.begin(), host_values.end(), validity.begin()); - auto expected_values = thrust::host_vector>(host_values.size()); + auto expected_values = + thrust::host_vector>(host_values.size()); std::transform(host_values.begin(), host_values.end(), validity.begin(), expected_values.begin(), [](T v, bool b) { - return (b) ? thrust::make_optional(static_cast(v)) - : thrust::nullopt; + return (b) ? cuda::std::make_optional(static_cast(v)) + : cuda::std::nullopt; }); auto it_dev = cudf::detail::indexalator_factory::make_input_optional_iterator(d_col); diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu index e569e58f42a..b206ff947bb 100644 --- a/cpp/tests/iterator/offsetalator_test.cu +++ b/cpp/tests/iterator/offsetalator_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/iterator/optional_iterator_test.cuh b/cpp/tests/iterator/optional_iterator_test.cuh index 6a264cee9a8..04f5410a44f 100644 --- a/cpp/tests/iterator/optional_iterator_test.cuh +++ b/cpp/tests/iterator/optional_iterator_test.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ #include +#include #include -#include template void nonull_optional_iterator(IteratorTest& testFixture) @@ -32,9 +32,9 @@ void nonull_optional_iterator(IteratorTest& testFixture) auto d_col = cudf::column_device_view::create(w_col); // calculate the expected value by CPU. - thrust::host_vector> replaced_array(host_values.size()); + thrust::host_vector> replaced_array(host_values.size()); std::transform(host_values.begin(), host_values.end(), replaced_array.begin(), [](auto s) { - return thrust::optional{s}; + return cuda::std::optional{s}; }); // GPU test @@ -61,19 +61,20 @@ void null_optional_iterator(IteratorTest& testFixture) auto d_col = cudf::column_device_view::create(w_col); // calculate the expected value by CPU. - thrust::host_vector> optional_values(host_values.size()); - std::transform(host_values.begin(), - host_values.end(), - host_bools.begin(), - optional_values.begin(), - [](auto s, bool b) { return b ? 
thrust::optional<T>{s} : thrust::optional<T>{}; });
+  thrust::host_vector<cuda::std::optional<T>> optional_values(host_values.size());
+  std::transform(
+    host_values.begin(),
+    host_values.end(),
+    host_bools.begin(),
+    optional_values.begin(),
+    [](auto s, bool b) { return b ? cuda::std::optional<T>{s} : cuda::std::optional<T>{}; });

-  thrust::host_vector<thrust::optional<T>> value_all_valid(host_values.size());
+  thrust::host_vector<cuda::std::optional<T>> value_all_valid(host_values.size());
   std::transform(host_values.begin(),
                  host_values.end(),
                  host_bools.begin(),
                  value_all_valid.begin(),
-                 [](auto s, bool b) { return thrust::optional<T>{s}; });
+                 [](auto s, bool b) { return cuda::std::optional<T>{s}; });

   // GPU test for correct null mapping
   testFixture.iterator_test_thrust(
diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu
index 98befb0a3ee..257c0979017 100644
--- a/cpp/tests/iterator/optional_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu
@@ -18,9 +18,9 @@

 #include

+#include <cuda/std/optional>
 #include
 #include
-#include <thrust/optional.h>
 #include

 #include
@@ -49,21 +49,21 @@ TYPED_TEST(NumericOptionalIteratorTest, null_optional_iterator) { null_optional_
 // Transformers and Operators for optional_iterator test
 template <typename T>
 struct transformer_optional_meanvar {
-  using ResultType = thrust::optional<cudf::meanvar<T>>;
+  using ResultType = cuda::std::optional<cudf::meanvar<T>>;

-  CUDF_HOST_DEVICE inline ResultType operator()(thrust::optional<T> const& optional)
+  CUDF_HOST_DEVICE inline ResultType operator()(cuda::std::optional<T> const& optional)
   {
     if (optional.has_value()) {
       auto v = *optional;
       return cudf::meanvar<T>{v, static_cast<T>(v * v), 1};
     }
-    return thrust::nullopt;
+    return cuda::std::nullopt;
   }
 };

 template <typename T>
 struct optional_to_meanvar {
-  CUDF_HOST_DEVICE inline T operator()(thrust::optional<T> const& v) { return v.value_or(T{0}); }
+  CUDF_HOST_DEVICE inline T operator()(cuda::std::optional<T> const& v) { return v.value_or(T{0}); }
 };

 // TODO: enable this test also at __CUDACC_DEBUG__
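A minimal host-side sketch of the accessor idiom these diffs migrate (illustrative only: `element_or_null` and the validity vector are invented, and plain `std::optional` stands in for `cuda::std::optional`, which exposes the same interface on the device):

```cpp
#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

// Hypothetical accessor mirroring optional_accessor: the result holds a
// value iff the element is marked valid.
std::optional<int> element_or_null(std::vector<int> const& data,
                                   std::vector<bool> const& valid,
                                   std::size_t i)
{
  return valid[i] ? std::optional<int>{data[i]} : std::optional<int>{};
}

int main()
{
  std::vector<int> const data{9, 8, 7};
  std::vector<bool> const valid{true, false, true};
  for (std::size_t i = 0; i < data.size(); ++i) {
    if (auto const v = element_or_null(data, valid, i)) {
      std::cout << *v << '\n';
    } else {
      std::cout << "null\n";
    }
  }
  return 0;
}
```

Because `cuda::std::optional` follows the `std::optional` contract, the migration in these patches is largely a mechanical rename plus include changes, which is why the hunks above touch signatures and `nullopt` spellings but not logic.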
From 555734dee7a8fb10f50c8609a8e4fb2c025e6305 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Tue, 20 Aug 2024 09:32:59 -0500
Subject: [PATCH 096/270] Remove thrust::optional from expression evaluator
 (#16604)

This PR follows up on a request from @davidwendt in
https://github.com/rapidsai/cudf/pull/15091#discussion_r1722183142.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16604
---
 cpp/include/cudf/ast/detail/expression_evaluator.cuh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
index 105d87ff96f..9d8762555d7 100644
--- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh
+++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,8 +29,6 @@
 #include

-#include <thrust/optional.h>
-
 namespace cudf {
 namespace ast {
@@ -278,7 +276,7 @@ struct expression_evaluator {
     detail::device_data_reference const& input_reference,
     IntermediateDataType<has_nulls>* thread_intermediate_storage,
     cudf::size_type left_row_index,
-    thrust::optional<cudf::size_type> right_row_index = {}) const
+    cudf::size_type right_row_index = {}) const
   {
     // TODO: Everywhere in the code assumes that the table reference is either
     // left or right. Should we error-check somewhere to prevent
@@ -291,7 +289,7 @@ struct expression_evaluator {
     // any case where input_reference.table_source == table_reference::RIGHT.
     // Otherwise, behavior is undefined.
     auto const row_index =
-      (input_reference.table_source == table_reference::LEFT) ? left_row_index : *right_row_index;
+      (input_reference.table_source == table_reference::LEFT) ? left_row_index : right_row_index;
     if constexpr (has_nulls) {
       return table.column(input_reference.data_index).is_valid(row_index)
                ? ReturnType(table.column(input_reference.data_index).element<Element>(row_index))
@@ -329,7 +327,7 @@ struct expression_evaluator {
     detail::device_data_reference const& device_data_reference,
     IntermediateDataType<has_nulls>* thread_intermediate_storage,
     cudf::size_type left_row_index,
-    thrust::optional<cudf::size_type> right_row_index = {}) const
+    cudf::size_type right_row_index = {}) const
   {
     CUDF_UNREACHABLE("Unsupported type in resolve_input.");
   }
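The shape of that simplification, as a stand-alone hedged sketch (function and parameter names invented; `std::optional` stands in for `thrust::optional`): when the optional path is dereferenced unconditionally anyway, a plain parameter with a `{}` default expresses the same contract with no wrapper and no dereference to get wrong.

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

// Before: the right-hand row index travels as an optional that the
// right-table path dereferences without checking.
std::int32_t resolve_before(bool use_left,
                            std::int32_t left,
                            std::optional<std::int32_t> right = {})
{
  return use_left ? left : *right;  // undefined behavior if right is empty
}

// After: a plain defaulted parameter; callers that only touch the left
// table may still omit it, and there is nothing left to dereference.
std::int32_t resolve_after(bool use_left,
                           std::int32_t left,
                           std::int32_t right = {})
{
  return use_left ? left : right;
}

int main()
{
  std::cout << resolve_before(true, 7) << ' ' << resolve_after(false, 7, 9) << '\n';
  return 0;
}
```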
Because the `cudf_polars` wheel tests (rightly) do not wait for `cudf` wheels to be available: https://github.com/rapidsai/cudf/blob/555734dee7a8fb10f50c8609a8e4fb2c025e6305/.github/workflows/pr.yaml#L154-L155 https://github.com/rapidsai/cudf/blob/555734dee7a8fb10f50c8609a8e4fb2c025e6305/.github/workflows/pr.yaml#L145-L146 Noticed this in #16611 ```text [rapids-download-from-s3] Downloading and decompressing s3://rapids-downloads/ci/cudf/pull-request/16611/a6b7eff/cudf_wheel_python_cudf_cu12_py310_x86_64.tar.gz into ./dist download failed: s3://rapids-downloads/ci/cudf/pull-request/16611/a6b7eff/cudf_wheel_python_cudf_cu12_py310_x86_64.tar.gz to - An error occurred (404) when calling the HeadObject operation: Not Found ``` ([build link](https://github.com/rapidsai/cudf/actions/runs/10472939821/job/29004728278?pr=16611)) Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16612 --- ci/test_wheel_cudf_polars.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 6438d13c4b7..e9c6188502c 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -20,15 +20,13 @@ fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the cudf and pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download pylibcudf built in the previous step RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist rapids-logger "Installing cudf_polars and its dependencies" # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ - "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" From e450baf1d748a4a361797ee18a1372095212b816 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 20 Aug 2024 13:59:58 -0500 Subject: [PATCH 098/270] remove streamz git dependency, standardize build dependency names, consolidate some dependency lists (#16611) Proposes some additional cleanup in `dependencies.yaml`, for things I noticed while working through #15483. 
* standardizes the naming of keys in the `files:` section for build dependencies - *`py_build_{project}` = dependencies for the `[build-system]` table* - *`py_rapids_build_{project}` = dependencies for the `[tool.rapids-build-backend]` table* - *this is how it was done over most of the other repos in https://github.com/rapidsai/build-planning/issues/31, it was just missed because `cudf` was one of the first repos to add `rapids-build-backend`* * removes the dependency on building `streamz` from latest source on GitHub - *`custreamz` conda packages and wheels depend on packages for those, not this git dependency* - https://github.com/rapidsai/cudf/blob/2f7d35435db2b5ed9ead96cf43e2a710db5e5e6d/dependencies.yaml#L752-L754 - https://github.com/rapidsai/cudf/blob/2f7d35435db2b5ed9ead96cf43e2a710db5e5e6d/conda/recipes/custreamz/meta.yaml#L45-L47 - *if this is really needed, I don't think it belongs in the `build_python_cudf` set* - *the last commit to `streamz` was 2 years ago (https://github.com/python-streamz/streamz), this doesn't seem like a `rapids-dask-dependency`, try-to-always-test-against-latest, situation to me* - *I'm guessing this is left over from a time before `streamz` was regularly publishing wheels... it's been in `dependencies.yaml` since that file was first introduced here in November 2022 (#11674)* - *the last release, v0.6.4, was made on July 27, 2022. There have been around 20 commits to `master` since then ([history link](https://github.com/python-streamz/streamz/commits/master/)) ... but if `custreamz` really needed those, I'd expect `custreamz` to depend on the version built from GitHub sources. I strongly suspect that that isn't the case.* * removes `build_python_cudf` and `build_python_libcudf` lists in `dependencies.yaml`, in favor of re-using the `depends_on_rmm` and `depends_on_pylibcudf` lists Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16611 --- .../all_cuda-118_arch-x86_64.yaml | 3 - .../all_cuda-125_arch-x86_64.yaml | 3 - dependencies.yaml | 137 ++++++------------ 3 files changed, 42 insertions(+), 101 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d0d18e57abc..018162bd848 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,6 @@ dependencies: - pandas - pandas>=2.0,<2.2.3dev0 - pandoc -- pip - pre-commit - ptxcompiler - pyarrow==16.1.0.* @@ -99,6 +98,4 @@ dependencies: - transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 -- pip: - - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index caf39a32d79..c60ffa7aaa5 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,7 +64,6 @@ dependencies: - pandas - pandas>=2.0,<2.2.3dev0 - pandoc -- pip - pre-commit - pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 @@ -97,6 +96,4 @@ dependencies: - transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 -- pip: - - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index a774345fe95..150d03be021 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -10,10 +10,10 @@ files: - build_all - 
build_cpp - build_python_common - - build_python_pylibcudf - - build_python_cudf - cuda - cuda_version + - depends_on_cupy + - depends_on_rmm - develop - docs - libarrow_build @@ -31,7 +31,6 @@ files: - test_python_cudf - test_python_dask_cudf - test_python_pylibcudf - - depends_on_cupy test_static_build: output: none includes: @@ -95,7 +94,8 @@ files: includes: - build_base - build_python_common - - build_python_cudf + - depends_on_pylibcudf + - depends_on_rmm py_run_cudf: output: pyproject pyproject_dir: python/cudf @@ -107,6 +107,7 @@ files: - pyarrow_run - depends_on_cupy - depends_on_pylibcudf + - depends_on_rmm py_test_cudf: output: pyproject pyproject_dir: python/cudf @@ -116,14 +117,14 @@ files: includes: - test_python_common - test_python_cudf - py_rapids_build_pylibcudf: + py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf extras: table: build-system includes: - rapids_build_skbuild - py_build_pylibcudf: + py_rapids_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf extras: @@ -132,15 +133,16 @@ files: includes: - build_base - build_python_common - - build_python_pylibcudf + - depends_on_rmm py_run_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf extras: table: project includes: - - run_pylibcudf + - depends_on_rmm - pyarrow_run + - run_pylibcudf py_test_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -215,14 +217,14 @@ files: includes: - test_python_common - test_python_dask_cudf - py_rapids_build_cudf_kafka: + py_build_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka extras: table: build-system includes: - rapids_build_skbuild - py_build_cudf_kafka: + py_rapids_build_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka extras: @@ -364,65 +366,6 @@ dependencies: # Sync with conda build constraint & wheel run constraint. # TODO: Change to `2.0.*` for NumPy 2 - numpy==1.23.* - build_python_pylibcudf: - common: - - output_types: conda - packages: - - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0 - - output_types: requirements - packages: - # pip recognizes the index as a global option for the requirements.txt file - # This index is needed for rmm-cu{11,12}. - - --extra-index-url=https://pypi.nvidia.com - - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple - specific: - - output_types: [requirements, pyproject] - matrices: - - matrix: - cuda: "12.*" - cuda_suffixed: "true" - packages: - - &rmm_cu12 rmm-cu12==24.10.*,>=0.0.0a0 - - matrix: - cuda: "11.*" - cuda_suffixed: "true" - packages: - - &rmm_cu11 rmm-cu11==24.10.*,>=0.0.0a0 - - {matrix: null, packages: [*rmm_unsuffixed]} - build_python_cudf: - common: - - output_types: conda - packages: - - *rmm_unsuffixed - - pip - - pip: - - git+https://github.com/python-streamz/streamz.git@master - - output_types: requirements - packages: - # pip recognizes the index as a global option for the requirements.txt file - # This index is needed for rmm-cu{11,12}. 
- - --extra-index-url=https://pypi.nvidia.com - - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple - - git+https://github.com/python-streamz/streamz.git@master - specific: - - output_types: [requirements, pyproject] - matrices: - - matrix: - cuda: "12.*" - cuda_suffixed: "true" - packages: - - &pylibcudf_cu12 pylibcudf-cu12==24.10.*,>=0.0.0a0 - - *rmm_cu12 - - matrix: - cuda: "11.*" - cuda_suffixed: "true" - packages: - - &pylibcudf_cu11 pylibcudf-cu11==24.10.*,>=0.0.0a0 - - *rmm_cu11 - - matrix: - packages: - - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0 - - *rmm_unsuffixed libarrow_build: common: - output_types: conda @@ -635,9 +578,6 @@ dependencies: - nvtx>=0.2.1 - packaging - typing_extensions>=4.0.0 - - output_types: conda - packages: - - *rmm_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -654,19 +594,6 @@ dependencies: packages: &run_pylibcudf_packages_all_cu11 - cuda-python>=11.7.1,<12.0a0 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} - - output_types: [requirements, pyproject] - matrices: - - matrix: - cuda: "12.*" - cuda_suffixed: "true" - packages: - - *rmm_cu12 - - matrix: - cuda: "11.*" - cuda_suffixed: "true" - packages: - - *rmm_cu11 - - {matrix: null, packages: [*rmm_unsuffixed]} run_cudf: common: - output_types: [conda, requirements, pyproject] @@ -677,9 +604,6 @@ dependencies: - packaging - rich - typing_extensions>=4.0.0 - - output_types: conda - packages: - - *rmm_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -711,19 +635,16 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - *rmm_cu12 - pynvjitlink-cu12>=0.0.0a0 - matrix: cuda: "12.*" cuda_suffixed: "false" packages: - - *rmm_unsuffixed - *pynvjitlink_unsuffixed - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - *rmm_cu11 - cubinlinker-cu11 - ptxcompiler-cu11 - matrix: @@ -732,7 +653,6 @@ dependencies: packages: &run_cudf_cu11_unsuffixed - *cubinlinker_unsuffixed - *ptxcompiler_unsuffixed - - *rmm_unsuffixed - {matrix: null, packages: *run_cudf_cu11_unsuffixed} run_cudf_polars: common: @@ -843,7 +763,7 @@ dependencies: common: - output_types: conda packages: - - *pylibcudf_unsuffixed + - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -857,12 +777,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - *pylibcudf_cu12 + - pylibcudf-cu12==24.10.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - *pylibcudf_cu11 + - pylibcudf-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcudf_unsuffixed]} depends_on_cudf: common: @@ -929,6 +849,33 @@ dependencies: packages: &cupy_packages_cu11 - cupy-cuda11x>=12.0.0 - {matrix: null, packages: *cupy_packages_cu11} + depends_on_rmm: + common: + - output_types: conda + packages: + - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for rmm-cu{11,12}. 
+ - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - rmm-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - rmm-cu11==24.10.*,>=0.0.0a0 + - matrix: + packages: + - *rmm_unsuffixed test_python_pandas_cudf: common: - output_types: [requirements, pyproject] From 28fee97c24bcb5f6c61241058c7c3f824687f654 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:02:49 -0400 Subject: [PATCH 099/270] Enable gtests previously disabled for compute-sanitizer bug (#16581) Enables tests disable in https://github.com/rapidsai/cudf/pull/15259 due to a `compute-sanitizer` bug. This has been fixed in the CUDA 12.5 release and the nightly memchecks should pass again with these enabled. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16581 --- .../iterator/value_iterator_test_numeric.cu | 16 ++----------- cpp/tests/reductions/reduction_tests.cpp | 3 --- .../reductions/segmented_reduction_tests.cpp | 24 ------------------- 3 files changed, 2 insertions(+), 41 deletions(-) diff --git a/cpp/tests/iterator/value_iterator_test_numeric.cu b/cpp/tests/iterator/value_iterator_test_numeric.cu index d3d1c12bdc7..39e05ff6832 100644 --- a/cpp/tests/iterator/value_iterator_test_numeric.cu +++ b/cpp/tests/iterator/value_iterator_test_numeric.cu @@ -23,17 +23,5 @@ template struct NumericValueIteratorTest : public IteratorTest {}; TYPED_TEST_SUITE(NumericValueIteratorTest, TestingTypes); -TYPED_TEST(NumericValueIteratorTest, non_null_iterator) -{ - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - non_null_iterator(*this); -} -TYPED_TEST(NumericValueIteratorTest, null_iterator) -{ - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - null_iterator(*this); -} +TYPED_TEST(NumericValueIteratorTest, non_null_iterator) { non_null_iterator(*this); } +TYPED_TEST(NumericValueIteratorTest, null_iterator) { null_iterator(*this); } diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 0ec4cfa34c4..949ffcc26a6 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -300,9 +300,6 @@ TYPED_TEST_SUITE(ReductionTest, cudf::test::NumericTypes); TYPED_TEST(ReductionTest, Product) { using T = TypeParam; - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } std::vector int_values({5, -1, 1, 0, 3, 2, 4}); std::vector host_bools({true, true, false, false, true, true, true}); diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 37efc116d2a..668690639a6 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -87,10 +87,6 @@ TYPED_TEST(SegmentedReductionTest, SumExcludeNulls) TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls) { - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - // [1, 3, 5], [null, 3, 5], [1], [null], [null, null], [] // 
values: {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX} // offsets: {0, 3, 6, 7, 8, 10, 10} @@ -141,10 +137,6 @@ TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls) TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls) { - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} // offsets: {0, 3, 6, 7, 8, 10, 10} @@ -193,10 +185,6 @@ TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls) TYPED_TEST(SegmentedReductionTest, MinExcludeNulls) { - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} // offsets: {0, 3, 6, 7, 8, 10, 10} @@ -388,10 +376,6 @@ TYPED_TEST(SegmentedReductionTest, SumIncludeNulls) TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls) { - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - // [1, 3, 5], [null, 3, 5], [1], [null], [null, null], [] // values: {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX} // offsets: {0, 3, 6, 7, 8, 10, 10} @@ -445,10 +429,6 @@ TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls) TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls) { - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} // offsets: {0, 3, 6, 7, 8, 10, 10} @@ -500,10 +480,6 @@ TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls) TYPED_TEST(SegmentedReductionTest, MinIncludeNulls) { - if constexpr (std::is_same_v || std::is_same_v) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } - } - // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX} // offsets: {0, 3, 6, 7, 8, 10, 10} From 58799d698d861866b5650d368f5195174fc9644e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:29:27 -1000 Subject: [PATCH 100/270] Add stricter typing and validation to ColumnAccessor (#16602) * Added typing annotations that are generally a little stricter on when `Column`s should be passed. 
Added error handling for these cases * Moved some argument checking that was performed on `DataFrame` to `ColumnAccessor` * Adding more `verify=False` to `ColumnAccessor` calls and preserving `.label_dtype` more when we're just selecting columns from the prior `ColumnAccessor` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16602 --- python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column_accessor.py | 114 ++++++++++-------- python/cudf/cudf/core/dataframe.py | 14 +-- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/indexing_utils.py | 4 - python/cudf/cudf/core/join/_join_helpers.py | 8 +- python/cudf/cudf/core/join/join.py | 5 +- .../cudf/cudf/tests/test_column_accessor.py | 2 +- 9 files changed, 80 insertions(+), 75 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index a90fe0f9ac6..e0f57df1368 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -282,7 +282,7 @@ def read_csv( # Set index if the index_col parameter is passed if index_col is not None and index_col is not False: if isinstance(index_col, int): - index_col_name = df._data.select_by_index(index_col).names[0] + index_col_name = df._data.get_labels_by_index(index_col)[0] df = df.set_index(index_col_name) if isinstance(index_col_name, str) and \ names is None and orig_header == "infer": diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d13351c49dd..a224e0ce0d0 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1698,7 +1698,7 @@ def join( # in case of MultiIndex if isinstance(lhs, cudf.MultiIndex): on = ( - lhs._data.select_by_index(level).names[0] + lhs._data.get_labels_by_index(level)[0] if isinstance(level, int) else level ) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 67c19f11e41..7aa3e5f8163 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -102,7 +102,7 @@ def __init__( rangeindex: bool = False, label_dtype: Dtype | None = None, verify: bool = True, - ): + ) -> None: if isinstance(data, ColumnAccessor): self._data = data._data self._level_names = data.level_names @@ -147,10 +147,10 @@ def __iter__(self): def __getitem__(self, key: Any) -> ColumnBase: return self._data[key] - def __setitem__(self, key: Any, value: Any): + def __setitem__(self, key: Any, value: ColumnBase) -> None: self.set_by_label(key, value) - def __delitem__(self, key: Any): + def __delitem__(self, key: Any) -> None: old_ncols = len(self._data) del self._data[key] new_ncols = len(self._data) @@ -174,7 +174,7 @@ def __repr__(self) -> str: def _from_columns_like_self( self, columns: abc.Iterable[ColumnBase], verify: bool = True - ): + ) -> Self: """ Return a new ColumnAccessor with columns and the properties of self. @@ -250,7 +250,7 @@ def _grouped_data(self) -> abc.MutableMapping: else: return self._data - def _clear_cache(self, old_ncols: int, new_ncols: int): + def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: """ Clear cached attributes. 
@@ -310,16 +310,14 @@ def to_pandas_index(self) -> pd.Index: ) return result - def insert( - self, name: Any, value: Any, loc: int = -1, validate: bool = True - ): + def insert(self, name: Any, value: ColumnBase, loc: int = -1) -> None: """ Insert column into the ColumnAccessor at the specified location. Parameters ---------- name : Name corresponding to the new column - value : column-like + value : ColumnBase loc : int, optional The location to insert the new value at. Must be (0 <= loc <= ncols). By default, the column is added @@ -330,33 +328,35 @@ def insert( None, this function operates in-place. """ name = self._pad_key(name) + if name in self._data: + raise ValueError(f"Cannot insert '{name}', already exists") old_ncols = len(self._data) if loc == -1: loc = old_ncols - if not (0 <= loc <= old_ncols): + elif not (0 <= loc <= old_ncols): raise ValueError( f"insert: loc out of bounds: must be 0 <= loc <= {old_ncols}" ) + + if not isinstance(value, column.ColumnBase): + raise ValueError("value must be a Column") + elif old_ncols > 0 and len(value) != self.nrows: + raise ValueError("All columns must be of equal length") + # TODO: we should move all insert logic here - if name in self._data: - raise ValueError(f"Cannot insert '{name}', already exists") if loc == old_ncols: - if validate: - value = column.as_column(value) - if old_ncols > 0 and len(value) != self.nrows: - raise ValueError("All columns must be of equal length") self._data[name] = value else: new_keys = self.names[:loc] + (name,) + self.names[loc:] new_values = self.columns[:loc] + (value,) + self.columns[loc:] - self._data = self._data.__class__(zip(new_keys, new_values)) + self._data = dict(zip(new_keys, new_values)) self._clear_cache(old_ncols, old_ncols + 1) if old_ncols == 0: # The type(name) may no longer match the prior label_dtype self.label_dtype = None - def copy(self, deep=False) -> ColumnAccessor: + def copy(self, deep: bool = False) -> Self: """ Make a copy of this ColumnAccessor. """ @@ -373,7 +373,7 @@ def copy(self, deep=False) -> ColumnAccessor: verify=False, ) - def select_by_label(self, key: Any) -> ColumnAccessor: + def select_by_label(self, key: Any) -> Self: """ Return a subset of this column accessor, composed of the keys specified by `key`. @@ -389,7 +389,7 @@ def select_by_label(self, key: Any) -> ColumnAccessor: if isinstance(key, slice): return self._select_by_label_slice(key) elif pd.api.types.is_list_like(key) and not isinstance(key, tuple): - return self._select_by_label_list_like(key) + return self._select_by_label_list_like(tuple(key)) else: if isinstance(key, tuple): if any(isinstance(k, slice) for k in key): @@ -427,9 +427,13 @@ def get_labels_by_index(self, index: Any) -> tuple: # TODO: Doesn't handle on-device columns return tuple(n for n, keep in zip(self.names, index) if keep) else: + if len(set(index)) != len(index): + raise NotImplementedError( + "Selecting duplicate column labels is not supported." + ) return tuple(self.names[i] for i in index) - def select_by_index(self, index: Any) -> ColumnAccessor: + def select_by_index(self, index: Any) -> Self: """ Return a ColumnAccessor composed of the columns specified by index. 
@@ -445,13 +449,15 @@ def select_by_index(self, index: Any) -> ColumnAccessor: """ keys = self.get_labels_by_index(index) data = {k: self._data[k] for k in keys} - return self.__class__( + return type(self)( data, multiindex=self.multiindex, level_names=self.level_names, + label_dtype=self.label_dtype, + verify=False, ) - def swaplevel(self, i=-2, j=-1): + def swaplevel(self, i=-2, j=-1) -> Self: """ Swap level i with level j. Calling this method does not change the ordering of the values. @@ -467,6 +473,10 @@ def swaplevel(self, i=-2, j=-1): ------- ColumnAccessor """ + if not self.multiindex: + raise ValueError( + "swaplevel is only valid for self.multiindex=True" + ) i = _get_level(i, self.nlevels, self.level_names) j = _get_level(j, self.nlevels, self.level_names) @@ -486,13 +496,16 @@ def swaplevel(self, i=-2, j=-1): new_names = list(self.level_names) new_names[i], new_names[j] = new_names[j], new_names[i] - return self.__class__( + return type(self)( new_data, - multiindex=True, + multiindex=self.multiindex, level_names=new_names, + rangeindex=self.rangeindex, + label_dtype=self.label_dtype, + verify=False, ) - def set_by_label(self, key: Any, value: Any, validate: bool = True): + def set_by_label(self, key: Any, value: ColumnBase) -> None: """ Add (or modify) column by name. @@ -500,26 +513,21 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True): ---------- key name of the column - value : column-like + value : Column The value to insert into the column. - validate : bool - If True, the provided value will be coerced to a column and - validated before setting (Default value = True). """ key = self._pad_key(key) - if validate: - value = column.as_column(value) - if len(self._data) > 0 and len(value) != self.nrows: - raise ValueError("All columns must be of equal length") + if not isinstance(value, column.ColumnBase): + raise ValueError("value must be a Column") + if len(self) > 0 and len(value) != self.nrows: + raise ValueError("All columns must be of equal length") old_ncols = len(self._data) self._data[key] = value new_ncols = len(self._data) self._clear_cache(old_ncols, new_ncols) - def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: - # Might be a generator - key = tuple(key) + def _select_by_label_list_like(self, key: tuple) -> Self: # Special-casing for boolean mask if (bn := len(key)) > 0 and all(map(is_bool, key)): if bn != (n := len(self.names)): @@ -539,19 +547,22 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: ) if self.multiindex: data = dict(_to_flat_dict_inner(data)) - return self.__class__( + return type(self)( data, multiindex=self.multiindex, level_names=self.level_names, + label_dtype=self.label_dtype, + verify=False, ) - def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: + def _select_by_label_grouped(self, key: Any) -> Self: result = self._grouped_data[key] if isinstance(result, column.ColumnBase): # self._grouped_data[key] = self._data[key] so skip validation - return self.__class__( + return type(self)( data={key: result}, multiindex=self.multiindex, + label_dtype=self.label_dtype, verify=False, ) else: @@ -563,9 +574,10 @@ def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: result, multiindex=self.nlevels - len(key) > 1, level_names=self.level_names[len(key) :], + verify=False, ) - def _select_by_label_slice(self, key: slice) -> ColumnAccessor: + def _select_by_label_slice(self, key: slice) -> Self: start, stop = key.start, key.stop if key.step is not None: raise TypeError("Label 
slicing with step is not supported") @@ -585,19 +597,22 @@ def _select_by_label_slice(self, key: slice) -> ColumnAccessor: stop_idx = len(self.names) - idx break keys = self.names[start_idx:stop_idx] - return self.__class__( + return type(self)( {k: self._data[k] for k in keys}, multiindex=self.multiindex, level_names=self.level_names, + label_dtype=self.label_dtype, verify=False, ) - def _select_by_label_with_wildcard(self, key: Any) -> ColumnAccessor: + def _select_by_label_with_wildcard(self, key: tuple) -> Self: key = self._pad_key(key, slice(None)) - return self.__class__( - {k: self._data[k] for k in self._data if _keys_equal(k, key)}, + data = {k: self._data[k] for k in self.names if _keys_equal(k, key)} + return type(self)( + data, multiindex=self.multiindex, level_names=self.level_names, + label_dtype=self.label_dtype, verify=False, ) @@ -614,7 +629,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: def rename_levels( self, mapper: Mapping[Any, Any] | Callable, level: int | None = None - ) -> ColumnAccessor: + ) -> Self: """ Rename the specified levels of the given ColumnAccessor @@ -686,7 +701,7 @@ def rename_column(x): verify=False, ) - def droplevel(self, level): + def droplevel(self, level) -> None: # drop the nth level if level < 0: level += self.nlevels @@ -701,9 +716,8 @@ def droplevel(self, level): self._level_names[:level] + self._level_names[level + 1 :] ) - if ( - len(self._level_names) == 1 - ): # can't use nlevels, as it depends on multiindex + if len(self._level_names) == 1: + # can't use nlevels, as it depends on multiindex self.multiindex = False self._clear_cache(old_ncols, new_ncols) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 97684129203..43693ec20b1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -382,19 +382,19 @@ def _setitem_tuple_arg(self, key, value): value = as_column(value, length=length) if isinstance(value, ColumnBase): - new_col = cudf.Series._from_column(value, index=idx) + new_ser = cudf.Series._from_column(value, index=idx) else: - new_col = cudf.Series(value, index=idx) + new_ser = cudf.Series(value, index=idx) if len(self._frame.index) != 0: - new_col = new_col._align_to_index( + new_ser = new_ser._align_to_index( self._frame.index, how="right" ) if len(self._frame.index) == 0: self._frame.index = ( - idx if idx is not None else cudf.RangeIndex(len(new_col)) + idx if idx is not None else cudf.RangeIndex(len(new_ser)) ) - self._frame._data.insert(key[1], new_col) + self._frame._data.insert(key[1], new_ser._column) else: if is_scalar(value): for col in columns_df._column_names: @@ -981,6 +981,7 @@ def _init_from_series_list(self, data, columns, index): self._data.rangeindex = isinstance( columns, (range, cudf.RangeIndex, pd.RangeIndex) ) + self._data.label_dtype = pd.Index(columns).dtype else: self._data.rangeindex = True @@ -3272,9 +3273,6 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): If False, a reindexing operation is performed if `value.index` is not equal to `self.index`. 
""" - if name in self._data: - raise NameError(f"duplicated column name {name}") - num_cols = self._num_columns if loc < 0: loc += num_cols + 1 diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ce23d671a6c..3e1efd7c97a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1010,9 +1010,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. """ for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): - self._data.set_by_label( - name, col._with_type_metadata(dtype), validate=False - ) + self._data.set_by_label(name, col._with_type_metadata(dtype)) return self diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a0089242909..8182e5cede2 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -152,10 +152,6 @@ def destructure_dataframe_iloc_indexer( column_names: ColumnLabels = list( frame._data.get_labels_by_index(cols) ) - if len(set(column_names)) != len(column_names): - raise NotImplementedError( - "cudf DataFrames do not support repeated column names" - ) except TypeError: raise TypeError( "Column indices must be integers, slices, " diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 32c84763401..854c44ff1a1 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -37,16 +37,16 @@ class _ColumnIndexer(_Indexer): def get(self, obj: cudf.DataFrame) -> ColumnBase: return obj._data[self.name] - def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False): - obj._data.set_by_label(self.name, value, validate=validate) + def set(self, obj: cudf.DataFrame, value: ColumnBase): + obj._data.set_by_label(self.name, value) class _IndexIndexer(_Indexer): def get(self, obj: cudf.DataFrame) -> ColumnBase: return obj.index._data[self.name] - def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False): - obj.index._data.set_by_label(self.name, value, validate=validate) + def set(self, obj: cudf.DataFrame, value: ColumnBase): + obj.index._data.set_by_label(self.name, value) def _match_join_keys( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index ce81c1fc5b1..b65bc7af832 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -272,8 +272,8 @@ def perform_merge(self) -> cudf.DataFrame: lcol_casted = lcol_casted.astype("category") rcol_casted = rcol_casted.astype("category") - left_key.set(self.lhs, lcol_casted, validate=False) - right_key.set(self.rhs, rcol_casted, validate=False) + left_key.set(self.lhs, lcol_casted) + right_key.set(self.rhs, rcol_casted) left_rows, right_rows = self._gather_maps( left_join_cols, right_join_cols @@ -329,7 +329,6 @@ def _merge_results( lkey.set( left_result, lkey.get(left_result).fillna(rkey.get(right_result)), - validate=False, ) # All columns from the left table make it into the output. 
Non-key diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 2d7bc809d4d..5cef077c18d 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -370,7 +370,7 @@ def test_replace_level_values_MultiColumn(): def test_clear_nrows_empty_before(): ca = ColumnAccessor({}) assert ca.nrows == 0 - ca.insert("new", [1]) + ca.insert("new", as_column([1])) assert ca.nrows == 1 From 8ab553c7835b21c2d5fcc76cb24960db03722b15 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 21 Aug 2024 08:47:13 -0400 Subject: [PATCH 101/270] Move libcudf reduction google-benchmarks to nvbench (#16564) Reworks the reduction benchmarks currently coded with google-bench to use nvbench instead. This removes the need to support `row_bit_count` for dictionary column types. #16121 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16564 --- cpp/benchmarks/CMakeLists.txt | 14 ++- cpp/benchmarks/reduction/anyall.cpp | 80 +++++++--------- cpp/benchmarks/reduction/dictionary.cpp | 111 ++++++++++++----------- cpp/benchmarks/reduction/minmax.cpp | 63 +++++-------- cpp/benchmarks/reduction/reduce.cpp | 116 ++++++++++++------------ cpp/benchmarks/reduction/scan.cpp | 65 ++++++------- 6 files changed, 210 insertions(+), 239 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 483b7b0a539..6db282a7728 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -207,12 +207,16 @@ ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu) # ################################################################################################## # * reduction benchmark --------------------------------------------------------------------------- -ConfigureBench( - REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/minmax.cpp - reduction/reduce.cpp reduction/scan.cpp -) ConfigureNVBench( - REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp + REDUCTION_NVBENCH + reduction/anyall.cpp + reduction/dictionary.cpp + reduction/minmax.cpp + reduction/rank.cpp + reduction/reduce.cpp + reduction/scan.cpp + reduction/scan_structs.cpp + reduction/segmented_reduce.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index e9d23881764..1e578fab181 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -16,65 +16,51 @@ #include #include -#include -#include -#include +#include #include #include #include -#include +#include -class Reduction : public cudf::benchmark {}; +#include -template -void BM_reduction_anyall(benchmark::State& state, - std::unique_ptr const& agg) +template +static void reduction_anyall(nvbench::state& state, nvbench::type_list) { - cudf::size_type const column_size{static_cast(state.range(0))}; - auto const dtype = cudf::type_to_id(); - data_profile const profile = data_profile_builder().no_validity().distribution( - dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 
0 : 100); - auto const values = create_random_column(dtype, row_count{column_size}, profile); + auto const size = static_cast(state.get_int64("size")); + auto const kind_str = state.get_string("kind"); - cudf::data_type output_dtype{cudf::type_id::BOOL8}; + auto const input_type = cudf::type_to_id(); + auto const agg = kind_str == "any" ? cudf::make_any_aggregation() + : cudf::make_all_aggregation(); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::reduce(*values, *agg, output_dtype); - } + data_profile const profile = + data_profile_builder().no_validity().distribution(input_type, + distribution_id::UNIFORM, + (kind_str == "all" ? 1 : 0), + (kind_str == "any" ? 0 : 100)); + auto const values = create_random_column(input_type, row_count{size}, profile); - // The benchmark takes a column and produces one scalar. - set_items_processed(state, column_size + 1); - set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype)); -} + auto const output_type = cudf::data_type{cudf::type_id::BOOL8}; + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_element_count(size); + state.add_global_memory_reads(size); + state.add_global_memory_writes(1); -#define concat(a, b, c) a##b##c -#define get_agg(op) concat(cudf::make_, op, _aggregation()) + state.exec(nvbench::exec_tag::sync, [&values, output_type, &agg](nvbench::launch& launch) { + cudf::reduce(*values, *agg, output_type); + }); -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, aggregation) \ - BENCHMARK_DEFINE_F(Reduction, name)(::benchmark::State & state) \ - { \ - BM_reduction_anyall(state, get_agg(aggregation)); \ - } \ - BENCHMARK_REGISTER_F(Reduction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ + set_throughputs(state); +} -#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \ - RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation) +using Types = nvbench::type_list; -REDUCE_BENCHMARK_DEFINE(bool, all); -REDUCE_BENCHMARK_DEFINE(int8_t, all); -REDUCE_BENCHMARK_DEFINE(int32_t, all); -REDUCE_BENCHMARK_DEFINE(float, all); -REDUCE_BENCHMARK_DEFINE(bool, any); -REDUCE_BENCHMARK_DEFINE(int8_t, any); -REDUCE_BENCHMARK_DEFINE(int32_t, any); -REDUCE_BENCHMARK_DEFINE(float, any); +NVBENCH_BENCH_TYPES(reduction_anyall, NVBENCH_TYPE_AXES(Types)) + .set_name("anyall") + .set_type_axes_names({"DataType"}) + .add_string_axis("kind", {"any", "all"}) + .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index 5095337dbb3..1bdb50a539a 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -16,79 +16,84 @@ #include #include -#include -#include +#include +#include #include #include #include #include -class ReductionDictionary : public cudf::benchmark {}; +#include -template -void BM_reduction_dictionary(benchmark::State& state, - std::unique_ptr const& agg) +template +static std::unique_ptr make_reduce_aggregation() { - cudf::size_type const column_size{static_cast(state.range(0))}; + switch (kind) { + case cudf::reduce_aggregation::ANY: + return cudf::make_any_aggregation(); + case cudf::reduce_aggregation::ALL: + return cudf::make_all_aggregation(); + case cudf::reduce_aggregation::MIN: + return 
cudf::make_min_aggregation(); + case cudf::reduce_aggregation::MAX: + return cudf::make_max_aggregation(); + case cudf::reduce_aggregation::MEAN: + return cudf::make_mean_aggregation(); + default: CUDF_FAIL("Unsupported reduce aggregation in this benchmark"); + } +} + +template +static void reduction_dictionary(nvbench::state& state, + nvbench::type_list>) +{ + cudf::size_type const size{static_cast(state.get_int64("size"))}; - // int column and encoded dictionary column data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, - (agg->kind == cudf::aggregation::ALL ? 1 : 0), - (agg->kind == cudf::aggregation::ANY ? 0 : 100)); - auto int_column = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); - auto number_col = cudf::cast(*int_column, cudf::data_type{cudf::type_to_id()}); + (kind == cudf::aggregation::ALL ? 1 : 0), + (kind == cudf::aggregation::ANY ? 0 : 100)); + auto int_column = create_random_column(cudf::type_to_id(), row_count{size}, profile); + auto number_col = cudf::cast(*int_column, cudf::data_type{cudf::type_to_id()}); auto values = cudf::dictionary::encode(*number_col); - cudf::data_type output_dtype = [&] { - if (agg->kind == cudf::aggregation::ANY || agg->kind == cudf::aggregation::ALL) + cudf::data_type output_type = [&] { + if (kind == cudf::aggregation::ANY || kind == cudf::aggregation::ALL) { return cudf::data_type{cudf::type_id::BOOL8}; - if (agg->kind == cudf::aggregation::MEAN) return cudf::data_type{cudf::type_id::FLOAT64}; - return cudf::data_type{cudf::type_to_id()}; + } + if (kind == cudf::aggregation::MEAN) { return cudf::data_type{cudf::type_id::FLOAT64}; } + return cudf::data_type{cudf::type_to_id()}; }(); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::reduce(*values, *agg, output_dtype); + auto agg = make_reduce_aggregation(); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_element_count(size); + state.add_global_memory_reads(size); + if (kind == cudf::aggregation::ANY || kind == cudf::aggregation::ALL) { + state.add_global_memory_writes(1); // BOOL8s + } else { + state.add_global_memory_writes(1); } - // The benchmark takes a column and produces two scalars. - set_items_processed(state, column_size + 1); + state.exec(nvbench::exec_tag::sync, [&values, output_type, &agg](nvbench::launch& launch) { + cudf::reduce(*values, *agg, output_type); + }); - // We don't set the metrics for the size read/written as row_bit_count() doesn't - // support the dictionary type yet (and so is estimate_size()). - // See https://github.com/rapidsai/cudf/issues/16121 for details. 
+ set_throughputs(state); } -#define concat(a, b, c) a##b##c -#define get_agg(op) concat(cudf::make_, op, _aggregation()) - -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, aggregation) \ - BENCHMARK_DEFINE_F(ReductionDictionary, name)(::benchmark::State & state) \ - { \ - BM_reduction_dictionary(state, get_agg(aggregation)); \ - } \ - BENCHMARK_REGISTER_F(ReductionDictionary, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ - -#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \ - RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation) +using Types = nvbench::type_list; +using AggKinds = nvbench::enum_type_list; -REDUCE_BENCHMARK_DEFINE(int32_t, all); -REDUCE_BENCHMARK_DEFINE(float, all); -REDUCE_BENCHMARK_DEFINE(int32_t, any); -REDUCE_BENCHMARK_DEFINE(float, any); -REDUCE_BENCHMARK_DEFINE(int32_t, min); -REDUCE_BENCHMARK_DEFINE(float, min); -REDUCE_BENCHMARK_DEFINE(int32_t, max); -REDUCE_BENCHMARK_DEFINE(float, max); -REDUCE_BENCHMARK_DEFINE(int32_t, mean); -REDUCE_BENCHMARK_DEFINE(float, mean); +NVBENCH_BENCH_TYPES(reduction_dictionary, NVBENCH_TYPE_AXES(Types, AggKinds)) + .set_name("reduction_dictionary") + .set_type_axes_names({"DataType", "AggKinds"}) + .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 050f2887221..c89e22d3f44 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -16,55 +16,40 @@ #include #include -#include -#include -#include +#include #include #include #include -class Reduction : public cudf::benchmark {}; +#include -template -void BM_reduction(benchmark::State& state) +template +static void reduction_minmax(nvbench::state& state, nvbench::type_list) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - auto const dtype_id = cudf::type_to_id(); - auto const input_column = - create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); + auto const size = static_cast(state.get_int64("size")); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::minmax(*input_column); - } + auto const input_type = cudf::type_to_id(); - // The benchmark takes a column and produces two scalars. 
- set_items_processed(state, column_size + 2); - cudf::data_type dtype = cudf::data_type{dtype_id}; - set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype)); -} + data_profile const profile = + data_profile_builder().no_validity().distribution(input_type, distribution_id::UNIFORM, 0, 100); + auto const input_column = create_random_column(input_type, row_count{size}, profile); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_element_count(size); + state.add_global_memory_reads(size); + state.add_global_memory_writes(2); -#define concat(a, b, c) a##b##c -#define get_agg(op) concat(cudf::make_, op, _aggregation()) + state.exec(nvbench::exec_tag::sync, + [&input_column](nvbench::launch& launch) { cudf::minmax(*input_column); }); -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, aggregation) \ - BENCHMARK_DEFINE_F(Reduction, name)(::benchmark::State & state) { BM_reduction(state); } \ - BENCHMARK_REGISTER_F(Reduction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ + set_throughputs(state); +} -#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \ - RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation) +using Types = nvbench::type_list; -REDUCE_BENCHMARK_DEFINE(bool, minmax); -REDUCE_BENCHMARK_DEFINE(int8_t, minmax); -REDUCE_BENCHMARK_DEFINE(int32_t, minmax); -using cudf::timestamp_ms; -REDUCE_BENCHMARK_DEFINE(timestamp_ms, minmax); -REDUCE_BENCHMARK_DEFINE(float, minmax); +NVBENCH_BENCH_TYPES(reduction_minmax, NVBENCH_TYPE_AXES(Types)) + .set_name("minmax") + .set_type_axes_names({"DataType"}) + .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 63c96f4fe9e..14bf90c4943 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -16,82 +16,80 @@ #include #include -#include -#include -#include +#include #include #include #include #include +#include + #include -class Reduction : public cudf::benchmark {}; +template +static std::unique_ptr make_reduce_aggregation() +{ + switch (kind) { + case cudf::reduce_aggregation::MIN: + return cudf::make_min_aggregation(); + case cudf::reduce_aggregation::SUM: + return cudf::make_sum_aggregation(); + case cudf::reduce_aggregation::MEAN: + return cudf::make_mean_aggregation(); + case cudf::reduce_aggregation::PRODUCT: + return cudf::make_product_aggregation(); + case cudf::reduce_aggregation::VARIANCE: + return cudf::make_variance_aggregation(); + case cudf::reduce_aggregation::STD: + return cudf::make_std_aggregation(); + default: CUDF_FAIL("Unsupported reduce aggregation in this benchmark"); + } +} -template -void BM_reduction(benchmark::State& state, std::unique_ptr const& agg) +template +static void reduction(nvbench::state& state, nvbench::type_list>) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - auto const dtype = cudf::type_to_id(); + auto const size = static_cast(state.get_int64("size")); + if (cudf::is_chrono() && kind != cudf::aggregation::MIN) { + state.skip("Skip chrono types for some aggregations"); + } + + auto const input_type = cudf::type_to_id(); data_profile const profile = - data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100); - auto const input_column = 
create_random_column(dtype, row_count{column_size}, profile); + data_profile_builder().no_validity().distribution(input_type, distribution_id::UNIFORM, 0, 100); + auto const input_column = create_random_column(input_type, row_count{size}, profile); - cudf::data_type output_dtype = - (agg->kind == cudf::aggregation::MEAN || agg->kind == cudf::aggregation::VARIANCE || - agg->kind == cudf::aggregation::STD) + cudf::data_type output_type = + (kind == cudf::aggregation::MEAN || kind == cudf::aggregation::VARIANCE || + kind == cudf::aggregation::STD) ? cudf::data_type{cudf::type_id::FLOAT64} : input_column->type(); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::reduce(*input_column, *agg, output_dtype); - } + auto agg = make_reduce_aggregation(); - // The benchmark takes a column and produces two scalars. - set_items_processed(state, column_size + 1); - set_bytes_processed(state, estimate_size(input_column->view()) + cudf::size_of(output_dtype)); -} + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_element_count(size); + state.add_global_memory_reads(size); + state.add_global_memory_writes(1); -#define concat(a, b, c) a##b##c -#define get_agg(op) concat(cudf::make_, op, _aggregation()) + state.exec(nvbench::exec_tag::sync, [&input_column, output_type, &agg](nvbench::launch& launch) { + cudf::reduce(*input_column, *agg, output_type); + }); -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, aggregation) \ - BENCHMARK_DEFINE_F(Reduction, name)(::benchmark::State & state) \ - { \ - BM_reduction(state, get_agg(aggregation)); \ - } \ - BENCHMARK_REGISTER_F(Reduction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ - -#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \ - RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation) + set_throughputs(state); +} -#define REDUCE_BENCHMARK_NUMERIC(aggregation) \ - REDUCE_BENCHMARK_DEFINE(bool, aggregation); \ - REDUCE_BENCHMARK_DEFINE(int8_t, aggregation); \ - REDUCE_BENCHMARK_DEFINE(int32_t, aggregation); \ - REDUCE_BENCHMARK_DEFINE(int64_t, aggregation); \ - REDUCE_BENCHMARK_DEFINE(float, aggregation); \ - REDUCE_BENCHMARK_DEFINE(double, aggregation); +using Types = nvbench::type_list; +using AggKinds = nvbench::enum_type_list; -REDUCE_BENCHMARK_NUMERIC(sum); -REDUCE_BENCHMARK_DEFINE(int32_t, product); -REDUCE_BENCHMARK_DEFINE(float, product); -REDUCE_BENCHMARK_DEFINE(int64_t, min); -REDUCE_BENCHMARK_DEFINE(double, min); -using cudf::timestamp_ms; -REDUCE_BENCHMARK_DEFINE(timestamp_ms, min); -REDUCE_BENCHMARK_DEFINE(int8_t, mean); -REDUCE_BENCHMARK_DEFINE(float, mean); -REDUCE_BENCHMARK_DEFINE(int32_t, variance); -REDUCE_BENCHMARK_DEFINE(double, variance); -REDUCE_BENCHMARK_DEFINE(int64_t, std); -REDUCE_BENCHMARK_DEFINE(float, std); +NVBENCH_BENCH_TYPES(reduction, NVBENCH_TYPE_AXES(Types, AggKinds)) + .set_name("reduction") + .set_type_axes_names({"DataType", "AggKinds"}) + .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index dc05aad9807..f3d67a79498 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -16,9 +16,7 @@ #include #include -#include -#include -#include +#include #include #include @@ -26,43 +24,38 @@ #include #include -class ReductionScan : public 
cudf::benchmark {};
+#include <nvbench/nvbench.cuh>

-template <typename type>
-static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
+template <typename DataType>
+static void reduction_scan(nvbench::state& state, nvbench::type_list<DataType>)
 {
-  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
-  auto const dtype  = cudf::type_to_id<type>();
-  auto const column = create_random_column(dtype, row_count{n_rows});
-  if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0);
+  auto const size       = static_cast<cudf::size_type>(state.get_int64("size"));
+  auto const nulls      = state.get_float64("nulls");
+  auto const input_type = cudf::type_to_id<DataType>();

-  std::unique_ptr<cudf::column> result = nullptr;
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    result = cudf::scan(
-      *column, *cudf::make_min_aggregation<cudf::scan_aggregation>(), cudf::scan_type::INCLUSIVE);
-  }
+  data_profile const profile = data_profile_builder().null_probability(nulls).distribution(
+    input_type, distribution_id::UNIFORM, 0, 100);
+  auto const input_column = create_random_column(input_type, row_count{size}, profile);

-  // The benchmark takes a column and produces a new column of the same size as input.
-  set_items_processed(state, n_rows * 2);
-  set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view()));
+  auto agg = cudf::make_min_aggregation<cudf::scan_aggregation>();
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_element_count(size);
+  state.add_global_memory_reads<DataType>(size);
+  state.add_global_memory_writes<DataType>(1);
+
+  state.exec(nvbench::exec_tag::sync, [&input_column, &agg](nvbench::launch& launch) {
+    cudf::scan(*input_column, *agg, cudf::scan_type::INCLUSIVE);
+  });
+
+  set_throughputs(state);
 }

-#define SCAN_BENCHMARK_DEFINE(name, type, nulls) \
-  BENCHMARK_DEFINE_F(ReductionScan, name) \
-  (::benchmark::State & state) { BM_reduction_scan<type>(state, nulls); } \
-  BENCHMARK_REGISTER_F(ReductionScan, name) \
-  ->UseManualTime() \
-  ->Arg(10000) /* 10k */ \
-  ->Arg(100000) /* 100k */ \
-  ->Arg(1000000) /* 1M */ \
-  ->Arg(10000000) /* 10M */ \
-  ->Arg(100000000); /* 100M */
+using Types = nvbench::type_list<int8_t, int16_t, int32_t, uint32_t, uint64_t, float, double>;

-SCAN_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false);
-SCAN_BENCHMARK_DEFINE(int32_no_nulls, int32_t, false);
-SCAN_BENCHMARK_DEFINE(uint64_no_nulls, uint64_t, false);
-SCAN_BENCHMARK_DEFINE(float_no_nulls, float, false);
-SCAN_BENCHMARK_DEFINE(int16_nulls, int16_t, true);
-SCAN_BENCHMARK_DEFINE(uint32_nulls, uint32_t, true);
-SCAN_BENCHMARK_DEFINE(double_nulls, double, true);
+NVBENCH_BENCH_TYPES(reduction_scan, NVBENCH_TYPE_AXES(Types))
+  .set_name("scan")
+  .set_type_axes_names({"DataType"})
+  .add_float64_axis("nulls", {0.0, 0.1})
+  .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000});

From 6a2f323ac2c53b32d8a1d47b36dd0d0786027a7c Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 21 Aug 2024 07:35:44 -0700
Subject: [PATCH 102/270] Fix function parameters with common dependency
 modified during their evaluation (#16620)

This fixes an issue in JNI C++ code. In particular, two parameters passed to
the same function call are both evaluated using a shared index value, but
that index is modified while one of the parameters is being evaluated,
leading to out-of-bounds access when evaluating the other.
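To make the hazard concrete, below is a minimal standalone sketch of the bug
pattern (not the actual JNI code: `read_next` and the `names` vector are
made-up stand-ins for `read_schema_element` and the JNI name array). C++ does
not sequence the evaluation of one function argument relative to another, so
an argument that mutates a shared index may run before or after a sibling
argument that reads it.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Stand-in for read_schema_element(): advances the shared index as a side
// effect and returns a value derived from it.
int read_next(int& index) { return ++index; }

int main()
{
  std::vector<std::string> const names = {"a", "b"};
  int index = 0;

  // Unsafe shape of the original code: both arguments to a hypothetical
  // insert() use `index`, and read_next() mutates it. The compiler may
  // evaluate either argument first, so names.at(index) can read the wrong
  // element or go out of bounds:
  //   insert(names.at(index), read_next(index));

  // Safe shape, mirroring the fix in this patch: pin down the name before
  // invoking the call that advances the index.
  auto const name = names.at(index);
  int const child = read_next(index);
  std::cout << name << " -> " << child << '\n';  // prints "a -> 1"
  return 0;
}
```

The patch applies exactly this reordering: each `names.get(index).get()`
lookup is hoisted into a local `name` before the recursive
`read_schema_element` call that increments the index.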
Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/16620
---
 java/src/main/native/src/TableJni.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 76ca8c533ce..ecc551f1143 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1037,9 +1037,9 @@ cudf::io::schema_element read_schema_element(int& index,
     // go to the next entry, so recursion can parse it.
     index++;
     for (int i = 0; i < num_children; i++) {
+      auto const name = std::string{names.get(index).get()};
       child_elems.insert(
-        std::pair{names.get(index).get(),
-                  cudf::jni::read_schema_element(index, children, names, types, scales)});
+        std::pair{name, cudf::jni::read_schema_element(index, children, names, types, scales)});
     }
     return cudf::io::schema_element{d_type, std::move(child_elems)};
   } else {
@@ -1830,9 +1830,9 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
       std::map data_types;
       int at = 0;
       while (at < n_types.size()) {
+        auto const name = std::string{n_col_names.get(at).get()};
         data_types.insert(std::pair{
-          n_col_names.get(at).get(),
-          cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
+          name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
       }
       opts.dtypes(data_types);
     } else {
@@ -1929,9 +1929,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
       std::map data_types;
       int at = 0;
       while (at < n_types.size()) {
+        auto const name = std::string{n_col_names.get(at).get()};
         data_types.insert(std::pair{
-          n_col_names.get(at).get(),
-          cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
+          name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
       }
       opts.dtypes(data_types);
     } else {

From bf2ee328f99cae51c8bdbc240e0ceedb102c24ca Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 21 Aug 2024 14:47:11 -0700
Subject: [PATCH 103/270] DOC: Refresh pylibcudf guide (#15856)

This PR updates the pylibcudf dev guide with some more recent recommendations.

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15856
---
 docs/cudf/source/developer_guide/pylibcudf.md | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 2ae545a4955..4e10459fe2b 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -13,10 +13,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design principles:
 - Every public function or method should be `cpdef`ed. This allows it to be used in both Cython and Python code. This incurs some slight overhead over `cdef` functions, but we assume that this is acceptable because 1) the vast majority of users will be using pure Python rather than Cython, and 2) the overhead of a `cpdef` function over a `cdef` function is on the order of a nanosecond, while CUDA kernel launch overhead is on the order of a microsecond, so these function overheads should be washed out by typical usage of pylibcudf.
 - Every variable used should be strongly typed and either be a primitive type (int, float, etc) or a cdef class. Any enums in C++ should be mirrored using `cpdef enum`, which will create both a C-style enum in Cython and a PEP 435-style Python enum that will automatically be used in Python.
 - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
-- All cudf code should interact only with pylibcudf, never with libcudf directly.
-- All imports should be relative so that pylibcudf can be easily extracted from cudf later
-  - Exception: All imports of libcudf API bindings in `cudf._lib.cpp` should use absolute imports of `cudf._lib.cpp as libcudf`. We should convert the `cpp` directory into a proper package so that it can be imported as `libcudf` in that fashion. When moving pylibcudf into a separate package, it will be renamed to `libcudf` and only the imports will need to change.
-- Ideally, pylibcudf should depend on nothing other than rmm and pyarrow. This will allow it to be extracted into a a largely standalone library and used in environments where the larger dependency tree of cudf may be cumbersome.
+- All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
+- Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.

 ## Relationship to libcudf

@@ -112,6 +110,9 @@ Then, a corresponding pylibcudf fixture may be created using a simple `from_arro
 This approach ensures consistent global coverage across types for various tests.

 In general, pylibcudf tests should prefer validating against a corresponding pyarrow implementation rather than hardcoding data.
+If there is no pyarrow implementation, another alternative is to write a pure Python implementation that loops over the values
+of the Table/Column, if a scalar Python equivalent of the pylibcudf implementation exists (this is especially relevant for string methods).
+
 This approach is more resilient to changes to input data, particularly given the fixture strategy outlined above.

 Standard tools for comparing between pylibcudf and pyarrow types are provided in the utils module.
@@ -242,3 +243,8 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input)
 [Cython supports specializing the contents of fused-type functions based on the argument types](https://cython.readthedocs.io/en/latest/src/userguide/fusedtypes.html#type-checking-specializations), so any type-specific logic may be encoded using the appropriate conditionals.

 See the pylibcudf source for examples of how to implement such functions.
+
+In the event that libcudf provides multiple overloads for the same function with differing numbers of arguments, specify the maximum number of arguments in the Cython definition,
+and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`.
+
+Finally, consider making a libcudf issue if you think this inconsistency can be addressed on the libcudf side.

From 6c4905da22ad5b3d5007f45f38a3fa8449f7f8e1 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Wed, 21 Aug 2024 21:03:12 -0700
Subject: [PATCH 104/270] Remove legacy Arrow interop APIs (#16590)

Contributes to #15193.
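The removed cudf::to_arrow/cudf::from_arrow overloads are replaced by the
Arrow C Data Interface entry points. As a rough sketch of the migration
(modeled on the TableJni.cpp changes at the end of this patch; the helper
names are illustrative and error handling is simplified, so treat it as an
outline rather than the definitive recipe):

    #include <cudf/interop.hpp>
    #include <cudf/table/table_view.hpp>

    #include <arrow/c/bridge.h>
    #include <arrow/table.h>

    // cudf -> arrow::Table: build an ArrowSchema plus a host-resident
    // ArrowDeviceArray, then import both through Arrow's C bridge.
    std::shared_ptr<arrow::Table> export_to_arrow(
      cudf::table_view const& tview,
      std::vector<cudf::column_metadata> const& metadata)
    {
      auto schema     = cudf::to_arrow_schema(tview, metadata);
      auto host_array = cudf::to_arrow_host(tview);
      auto batch =
        arrow::ImportRecordBatch(&host_array->array, schema.get()).ValueOrDie();
      return arrow::Table::FromRecordBatches({batch}).ValueOrDie();
    }

    // arrow::Table -> cudf::table: export through the C interface and hand
    // the raw ArrowSchema/ArrowArray pair to the remaining from_arrow overload.
    std::unique_ptr<cudf::table> import_from_arrow(
      std::shared_ptr<arrow::Table> const& table)
    {
      ArrowSchema sch;
      if (!arrow::ExportSchema(*table->schema(), &sch).ok()) { return nullptr; }
      auto batch = table->CombineChunksToBatch().ValueOrDie();
      ArrowArray arr;
      if (!arrow::ExportRecordBatch(*batch, &arr).ok()) { return nullptr; }
      auto result = cudf::from_arrow(&sch, &arr);
      arr.release(&arr);
      sch.release(&sch);
      return result;
    }

Note that the JNI code additionally forces every exported field to be
nullable (via the small set_nullable helper added below) before importing
the record batch.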
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Robert (Bobby) Evans (https://github.com/revans2) - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16590 --- cpp/CMakeLists.txt | 3 - cpp/include/cudf/detail/interop.hpp | 101 +--- cpp/include/cudf/interop.hpp | 101 ---- cpp/src/interop/detail/arrow_allocator.cpp | 83 --- cpp/src/interop/detail/arrow_allocator.hpp | 31 -- cpp/src/interop/from_arrow.cu | 524 ------------------- cpp/src/interop/to_arrow.cu | 490 ----------------- cpp/tests/interop/arrow_utils.hpp | 64 ++- java/src/main/native/src/ColumnVectorJni.cpp | 76 ++- java/src/main/native/src/TableJni.cpp | 35 +- 10 files changed, 167 insertions(+), 1341 deletions(-) delete mode 100644 cpp/src/interop/detail/arrow_allocator.cpp delete mode 100644 cpp/src/interop/detail/arrow_allocator.hpp delete mode 100644 cpp/src/interop/from_arrow.cu delete mode 100644 cpp/src/interop/to_arrow.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ff00c484501..6b8bb26825b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -363,17 +363,14 @@ add_library( src/hash/sha512_hash.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp - src/interop/from_arrow.cu src/interop/arrow_utilities.cpp src/interop/decimal_conversion_utilities.cu - src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/to_arrow_host.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu src/interop/from_arrow_stream.cu src/interop/to_arrow_schema.cpp - src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 0b9319ba663..0d8f078c9d1 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -16,29 +16,13 @@ #pragma once -// We disable warning 611 because the `arrow::TableBatchReader` only partially -// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext` -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#pragma nv_diag_suppress 2810 -#endif -#include - -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#pragma nv_diag_default 2810 -#endif - #include #include #include #include #include - -#include +#include namespace CUDF_EXPORT cudf { namespace detail { @@ -61,89 +45,6 @@ DLManagedTensor* to_dlpack(table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -// Creating arrow as per given type_id and buffer arguments -template -std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... 
args) -{ - switch (id) { - case type_id::BOOL8: return std::make_shared(std::forward(args)...); - case type_id::INT8: return std::make_shared(std::forward(args)...); - case type_id::INT16: return std::make_shared(std::forward(args)...); - case type_id::INT32: return std::make_shared(std::forward(args)...); - case type_id::INT64: return std::make_shared(std::forward(args)...); - case type_id::UINT8: return std::make_shared(std::forward(args)...); - case type_id::UINT16: return std::make_shared(std::forward(args)...); - case type_id::UINT32: return std::make_shared(std::forward(args)...); - case type_id::UINT64: return std::make_shared(std::forward(args)...); - case type_id::FLOAT32: return std::make_shared(std::forward(args)...); - case type_id::FLOAT64: return std::make_shared(std::forward(args)...); - case type_id::TIMESTAMP_DAYS: - return std::make_shared(std::make_shared(), - std::forward(args)...); - case type_id::TIMESTAMP_SECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::SECOND), - std::forward(args)...); - case type_id::TIMESTAMP_MILLISECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::MILLI), - std::forward(args)...); - case type_id::TIMESTAMP_MICROSECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::MICRO), - std::forward(args)...); - case type_id::TIMESTAMP_NANOSECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::NANO), - std::forward(args)...); - case type_id::DURATION_SECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::SECOND), - std::forward(args)...); - case type_id::DURATION_MILLISECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::MILLI), - std::forward(args)...); - case type_id::DURATION_MICROSECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::MICRO), - std::forward(args)...); - case type_id::DURATION_NANOSECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::NANO), - std::forward(args)...); - default: CUDF_FAIL("Unsupported type_id conversion to arrow"); - } -} - -// Converting arrow type to cudf type -data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); - -/** - * @copydoc cudf::to_arrow(table_view input, std::vector const& metadata, - * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) - */ -std::shared_ptr to_arrow(table_view input, - std::vector const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr); - -/** - * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata, - * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) - */ -std::shared_ptr to_arrow(cudf::scalar const& input, - column_metadata const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr); -/** - * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) - */ -std::unique_ptr
from_arrow(arrow::Table const& input_table, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -/** - * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) - */ -std::unique_ptr from_arrow(arrow::Scalar const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - /** * @brief Return a maximum precision for a given type. * diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 9a8f87b4a46..0f52b0f7b31 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -16,21 +16,6 @@ #pragma once -// We disable warning 611 because the `arrow::TableBatchReader` only partially -// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext` -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#pragma nv_diag_suppress 2810 -#endif -#include - -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#pragma nv_diag_default 2810 -#endif - #include #include #include @@ -131,59 +116,6 @@ struct column_metadata { column_metadata() = default; }; -/** - * @brief Create `arrow::Table` from cudf table `input` - * - * Converts the `cudf::table_view` to `arrow::Table` with the provided - * metadata `column_names`. - * - * @deprecated Since 24.08. Use cudf::to_arrow_host instead. - * - * @throws cudf::logic_error if `column_names` size doesn't match with number of columns. - * - * @param input table_view that needs to be converted to arrow Table - * @param metadata Contains hierarchy of names of columns and children - * @param stream CUDA stream used for device memory operations and kernel launches - * @param ar_mr arrow memory pool to allocate memory for arrow Table - * @return arrow Table generated from `input` - * - * @note For decimals, since the precision is not stored for them in libcudf, - * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type - * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision - * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be - * converted to Arrow decimal128 of the precision 38. - */ -[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr to_arrow( - table_view input, - std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); - -/** - * @brief Create `arrow::Scalar` from cudf scalar `input` - * - * Converts the `cudf::scalar` to `arrow::Scalar`. - * - * @deprecated Since 24.08. - * - * @param input scalar that needs to be converted to arrow Scalar - * @param metadata Contains hierarchy of names of columns and children - * @param stream CUDA stream used for device memory operations and kernel launches - * @param ar_mr arrow memory pool to allocate memory for arrow Scalar - * @return arrow Scalar generated from `input` - * - * @note For decimals, since the precision is not stored for them in libcudf, - * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type - * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision - * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be - * converted to Arrow decimal128 of the precision 38. 
- */ -[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr to_arrow( - cudf::scalar const& input, - column_metadata const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); - /** * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter * @@ -386,39 +318,6 @@ unique_device_array_t to_arrow_host( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -/** - * @brief Create `cudf::table` from given arrow Table input - * - * @deprecated Since 24.08. Use cudf::from_arrow_host instead. - * - * @param input arrow:Table that needs to be converted to `cudf::table` - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate `cudf::table` - * @return cudf table generated from given arrow Table - */ -[[deprecated("Use cudf::from_arrow_host")]] std::unique_ptr
from_arrow( - arrow::Table const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Create `cudf::scalar` from given arrow Scalar input - * - * @deprecated Since 24.08. Use arrow's `MakeArrayFromScalar` on the - * input, followed by `ExportArray` to obtain something that can be - * consumed by `from_arrow_host`. Then use `cudf::get_element` to - * extract a device scalar from the column. - * - * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate `cudf::scalar` - * @return cudf scalar generated from given arrow Scalar - */ -[[deprecated("See docstring for migration strategies")]] std::unique_ptr from_arrow( - arrow::Scalar const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - /** * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input * diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp deleted file mode 100644 index 2a19a5360fe..00000000000 --- a/cpp/src/interop/detail/arrow_allocator.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include - -namespace cudf { -namespace detail { - -/* - Enable Transparent Huge Pages (THP) for large (>4MB) allocations. - `buf` is returned untouched. - Enabling THP can improve performance of device-host memory transfers - significantly, see . -*/ -template -T enable_hugepage(T&& buf) -{ - if (buf->size() < (1u << 22u)) { // Smaller than 4 MB - return std::move(buf); - } - -#ifdef MADV_HUGEPAGE - auto const pagesize = sysconf(_SC_PAGESIZE); - void* addr = const_cast(buf->data()); - if (addr == nullptr) { return std::move(buf); } - auto length{static_cast(buf->size())}; - if (std::align(pagesize, pagesize, addr, length)) { - // Intentionally not checking for errors that may be returned by older kernel versions; - // optimistically tries enabling huge pages. - madvise(addr, length, MADV_HUGEPAGE); - } -#endif - return std::move(buf); -} - -std::unique_ptr allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr) -{ - /* - nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer - and `ValueOrDie` are used inside a CUDA compilation unit. 
- - To work around this issue we compile an allocation shim in C++ and use - that from our cuda sources - */ - arrow::Result> result = arrow::AllocateBuffer(size, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer"); - return enable_hugepage(std::move(result).ValueOrDie()); -} - -std::shared_ptr allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr) -{ - /* - nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer - and `ValueOrDie` are used inside a CUDA compilation unit. - - To work around this issue we compile an allocation shim in C++ and use - that from our cuda sources - */ - arrow::Result> result = arrow::AllocateBitmap(size, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap"); - return enable_hugepage(std::move(result).ValueOrDie()); -} - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/interop/detail/arrow_allocator.hpp b/cpp/src/interop/detail/arrow_allocator.hpp deleted file mode 100644 index 75c1baa0dca..00000000000 --- a/cpp/src/interop/detail/arrow_allocator.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cudf { -namespace detail { - -// unique_ptr because that is what AllocateBuffer returns -std::unique_ptr allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr); - -// shared_ptr because that is what AllocateBitmap returns -std::shared_ptr allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr); - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu deleted file mode 100644 index 579820cbae3..00000000000 --- a/cpp/src/interop/from_arrow.cu +++ /dev/null @@ -1,524 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -namespace cudf { - -namespace detail { -data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) -{ - switch (arrow_type.id()) { - case arrow::Type::NA: return data_type(type_id::EMPTY); - case arrow::Type::BOOL: return data_type(type_id::BOOL8); - case arrow::Type::INT8: return data_type(type_id::INT8); - case arrow::Type::INT16: return data_type(type_id::INT16); - case arrow::Type::INT32: return data_type(type_id::INT32); - case arrow::Type::INT64: return data_type(type_id::INT64); - case arrow::Type::UINT8: return data_type(type_id::UINT8); - case arrow::Type::UINT16: return data_type(type_id::UINT16); - case arrow::Type::UINT32: return data_type(type_id::UINT32); - case arrow::Type::UINT64: return data_type(type_id::UINT64); - case arrow::Type::FLOAT: return data_type(type_id::FLOAT32); - case arrow::Type::DOUBLE: return data_type(type_id::FLOAT64); - case arrow::Type::DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case arrow::Type::TIMESTAMP: { - auto type = static_cast(&arrow_type); - switch (type->unit()) { - case arrow::TimeUnit::type::SECOND: return data_type(type_id::TIMESTAMP_SECONDS); - case arrow::TimeUnit::type::MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); - case arrow::TimeUnit::type::MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); - case arrow::TimeUnit::type::NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); - default: CUDF_FAIL("Unsupported timestamp unit in arrow"); - } - } - case arrow::Type::DURATION: { - auto type = static_cast(&arrow_type); - switch (type->unit()) { - case arrow::TimeUnit::type::SECOND: return data_type(type_id::DURATION_SECONDS); - case arrow::TimeUnit::type::MILLI: return data_type(type_id::DURATION_MILLISECONDS); - case arrow::TimeUnit::type::MICRO: return data_type(type_id::DURATION_MICROSECONDS); - case arrow::TimeUnit::type::NANO: return data_type(type_id::DURATION_NANOSECONDS); - default: CUDF_FAIL("Unsupported duration unit in arrow"); - } - } - case arrow::Type::STRING: return data_type(type_id::STRING); - case arrow::Type::LARGE_STRING: return data_type(type_id::STRING); - case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); - case arrow::Type::LIST: return data_type(type_id::LIST); - case arrow::Type::DECIMAL: { - auto const type = static_cast(&arrow_type); - return data_type{type_id::DECIMAL128, -type->scale()}; - } - case arrow::Type::STRUCT: return data_type(type_id::STRUCT); - default: CUDF_FAIL("Unsupported type_id conversion to cudf"); - } -} - -namespace { -/** - * @brief Functor to return column for a corresponding arrow array. column - * is formed from buffer underneath the arrow array along with any offset and - * change in length that array has. - */ -struct dispatch_to_cudf_column { - /** - * @brief Returns mask from an array without any offsets. 
- */ - std::unique_ptr get_mask_buffer(arrow::Array const& array, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - if (array.null_bitmap_data() == nullptr) { - return std::make_unique(0, stream, mr); - } - auto const null_bitmap_size = array.null_bitmap()->size(); - auto const allocation_size = - bitmask_allocation_size_bytes(static_cast(null_bitmap_size * CHAR_BIT)); - auto mask = std::make_unique(allocation_size, stream, mr); - auto mask_buffer = array.null_bitmap(); - CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), - reinterpret_cast(mask_buffer->address()), - null_bitmap_size, - cudaMemcpyDefault, - stream.value())); - // Zero-initialize trailing padding bytes - auto const num_trailing_bytes = allocation_size - null_bitmap_size; - if (num_trailing_bytes > 0) { - auto trailing_bytes = static_cast(mask->data()) + null_bitmap_size; - CUDF_CUDA_TRY(cudaMemsetAsync(trailing_bytes, 0, num_trailing_bytes, stream.value())); - } - return mask; - } - - template ())> - std::unique_ptr operator()( - arrow::Array const&, data_type, bool, rmm::cuda_stream_view, rmm::device_async_resource_ref) - { - CUDF_FAIL("Unsupported type in from_arrow."); - } - - template ())> - std::unique_ptr operator()(arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto data_buffer = array.data()->buffers[1]; - size_type const num_rows = array.length(); - auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; - auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_column_view = col->mutable_view(); - CUDF_CUDA_TRY(cudaMemcpyAsync( - mutable_column_view.data(), - reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(T), - sizeof(T) * num_rows, - cudaMemcpyDefault, - stream.value())); - if (has_nulls) { - auto tmp_mask = get_mask_buffer(array, stream, mr); - - // If array is sliced, we have to copy whole mask and then take copy. - auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) - ? 
std::move(*tmp_mask) - : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); - - col->set_null_mask(std::move(out_mask), array.null_count()); - } - - return col; - } -}; - -std::unique_ptr get_empty_type_column(size_type size) -{ - // this abomination is required by cuDF Python, which needs to handle - // [PyArrow null arrays](https://arrow.apache.org/docs/python/generated/pyarrow.NullArray.html) - // of finite length - return std::make_unique( - data_type(type_id::EMPTY), size, rmm::device_buffer{}, rmm::device_buffer{}, size); -} - -/** - * @brief Returns cudf column formed from given arrow array - * This has been introduced to take care of compiler error "error: explicit specialization of - * function must precede its first use" - */ -std::unique_ptr get_column(arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using DeviceType = __int128_t; - - auto data_buffer = array.data()->buffers[1]; - auto const num_rows = static_cast(array.length()); - auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_column_view = col->mutable_view(); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - mutable_column_view.data(), - reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(DeviceType), - sizeof(DeviceType) * num_rows, - cudaMemcpyDefault, - stream.value())); - - auto null_mask = [&] { - if (not skip_mask and array.null_bitmap_data()) { - auto temp_mask = get_mask_buffer(array, stream, mr); - // If array is sliced, we have to copy whole mask and then take copy. - return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) - ? std::move(*temp_mask.release()) - : cudf::detail::copy_bitmask(static_cast(temp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); - } - return rmm::device_buffer{}; - }(); - - col->set_null_mask(std::move(null_mask), array.null_count()); - return col; -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()(arrow::Array const& array, - data_type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto data_buffer = array.data()->buffers[1]; - // mask-to-bools expects the mask to be bitmask_type aligned/padded - auto data = rmm::device_buffer( - cudf::bitmask_allocation_size_bytes(data_buffer->size() * CHAR_BIT), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), - reinterpret_cast(data_buffer->address()), - data_buffer->size(), - cudaMemcpyDefault, - stream.value())); - auto out_col = mask_to_bools(static_cast(data.data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); - - auto const has_nulls = skip_mask ? 
false : array.null_bitmap_data() != nullptr; - if (has_nulls) { - auto out_mask = - detail::copy_bitmask(static_cast(get_mask_buffer(array, stream, mr)->data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); - - out_col->set_null_mask(std::move(out_mask), array.null_count()); - } - - return out_col; -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - if (array.length() == 0) { return make_empty_column(type_id::STRING); } - - std::unique_ptr offsets_column; - std::unique_ptr char_array; - - if (array.type_id() == arrow::Type::LARGE_STRING) { - auto str_array = static_cast(&array); - auto offset_array = std::make_unique( - str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr); - offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT64), true, stream, mr); - char_array = std::make_unique( - str_array->value_data()->size(), str_array->value_data(), nullptr); - } else if (array.type_id() == arrow::Type::STRING) { - auto str_array = static_cast(&array); - auto offset_array = std::make_unique( - str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); - offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, stream, mr); - char_array = std::make_unique( - str_array->value_data()->size(), str_array->value_data(), nullptr); - } else { - throw std::runtime_error("Unsupported array type"); - } - - rmm::device_buffer chars(char_array->length(), stream, mr); - auto data_buffer = char_array->data()->buffers[1]; - CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), - reinterpret_cast(data_buffer->address()), - chars.size(), - cudaMemcpyDefault, - stream.value())); - - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars), - array.null_count(), - std::move(*get_mask_buffer(array, stream, mr))); - - return num_rows == array.length() - ? 
std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(array.offset()), - static_cast(array.offset() + array.length()), - stream), - stream, - mr); -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto dict_array = static_cast(&array); - auto dict_type = arrow_to_cudf_type(*(dict_array->dictionary()->type())); - auto keys_column = get_column(*(dict_array->dictionary()), dict_type, true, stream, mr); - auto ind_type = arrow_to_cudf_type(*(dict_array->indices()->type())); - - auto indices_column = get_column(*(dict_array->indices()), ind_type, false, stream, mr); - // If index type is not of type uint32_t, then cast it to uint32_t - auto const dict_indices_type = data_type{type_id::UINT32}; - if (indices_column->type().id() != dict_indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), dict_indices_type, stream, mr); - - // Child columns shouldn't have masks and we need the mask in main column - auto column_contents = indices_column->release(); - indices_column = std::make_unique(dict_indices_type, - static_cast(array.length()), - std::move(*(column_contents.data)), - rmm::device_buffer{}, - 0); - - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - std::move(*(column_contents.null_mask)), - array.null_count()); -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto struct_array = static_cast(&array); - std::vector> child_columns; - // Offsets have already been applied to child - arrow::ArrayVector array_children = struct_array->fields(); - std::transform(array_children.cbegin(), - array_children.cend(), - std::back_inserter(child_columns), - [&mr, &stream](auto const& child_array) { - auto type = arrow_to_cudf_type(*(child_array->type())); - return get_column(*child_array, type, false, stream, mr); - }); - - auto out_mask = std::move(*(get_mask_buffer(array, stream, mr))); - if (struct_array->null_bitmap_data() != nullptr) { - out_mask = detail::copy_bitmask(static_cast(out_mask.data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); - } - - return make_structs_column( - array.length(), move(child_columns), array.null_count(), std::move(out_mask), stream, mr); -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto list_array = static_cast(&array); - auto offset_array = std::make_unique( - list_array->value_offsets()->size() / sizeof(int32_t), list_array->value_offsets(), nullptr); - auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, stream, mr); - - auto child_type = arrow_to_cudf_type(*(list_array->values()->type())); - auto child_column = get_column(*(list_array->values()), child_type, false, stream, mr); - - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - array.null_count(), - std::move(*get_mask_buffer(array, stream, mr)), - stream, - mr); - - return num_rows == array.length() - ? 
std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(array.offset()), - static_cast(array.offset() + array.length()), - stream), - stream, - mr); -} - -std::unique_ptr get_column(arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - return type.id() != type_id::EMPTY - ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, stream, mr) - : get_empty_type_column(array.length()); -} - -} // namespace - -std::unique_ptr
from_arrow(arrow::Table const& input_table, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - if (input_table.num_columns() == 0) { return std::make_unique
(); } - std::vector> columns; - auto chunked_arrays = input_table.columns(); - std::transform(chunked_arrays.begin(), - chunked_arrays.end(), - std::back_inserter(columns), - [&mr, &stream](auto const& chunked_array) { - std::vector> concat_columns; - auto cudf_type = arrow_to_cudf_type(*(chunked_array->type())); - auto array_chunks = chunked_array->chunks(); - if (cudf_type.id() == type_id::EMPTY) { - return get_empty_type_column(chunked_array->length()); - } - std::transform(array_chunks.begin(), - array_chunks.end(), - std::back_inserter(concat_columns), - [&cudf_type, &mr, &stream](auto const& array_chunk) { - return get_column(*array_chunk, cudf_type, false, stream, mr); - }); - if (concat_columns.empty()) { - return std::make_unique( - cudf_type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); - } else if (concat_columns.size() == 1) { - return std::move(concat_columns[0]); - } - - std::vector column_views; - std::transform(concat_columns.begin(), - concat_columns.end(), - std::back_inserter(column_views), - [](auto const& col) { return col->view(); }); - return cudf::detail::concatenate(column_views, stream, mr); - }); - - return std::make_unique
(std::move(columns)); -} - -std::unique_ptr from_arrow(arrow::Scalar const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto maybe_array = arrow::MakeArrayFromScalar(input, 1); - if (!maybe_array.ok()) { CUDF_FAIL("Failed to create array"); } - auto array = *maybe_array; - - auto field = arrow::field("", input.type); - - auto table = arrow::Table::Make(arrow::schema({field}), {array}); - - auto cudf_table = detail::from_arrow(*table, stream, mr); - - auto cv = cudf_table->view().column(0); - return get_element(cv, 0, stream); -} - -} // namespace detail - -std::unique_ptr
from_arrow(arrow::Table const& input_table, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - return detail::from_arrow(input_table, stream, mr); -} - -std::unique_ptr from_arrow(arrow::Scalar const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - return detail::from_arrow(input, stream, mr); -} -} // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu deleted file mode 100644 index a867d4adfa1..00000000000 --- a/cpp/src/interop/to_arrow.cu +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "arrow_utilities.hpp" -#include "decimal_conversion_utilities.cuh" -#include "detail/arrow_allocator.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace detail { -namespace { - -/** - * @brief Create arrow data buffer from given cudf column - */ -template -std::shared_ptr fetch_data_buffer(device_span input, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - int64_t const data_size_in_bytes = sizeof(T) * input.size(); - - auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - input.data(), - data_size_in_bytes, - cudaMemcpyDefault, - stream.value())); - - return std::move(data_buffer); -} - -/** - * @brief Create arrow buffer of mask from given cudf column - */ -std::shared_ptr fetch_mask_buffer(column_view input_view, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - int64_t const mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size()); - - if (input_view.has_nulls()) { - auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), ar_mr); - CUDF_CUDA_TRY(cudaMemcpyAsync( - mask_buffer->mutable_data(), - (input_view.offset() > 0) - ? 
cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource()) - .data() - : input_view.null_mask(), - mask_size_in_bytes, - cudaMemcpyDefault, - stream.value())); - - // Resets all padded bits to 0 - mask_buffer->ZeroPadding(); - - return mask_buffer; - } - - return nullptr; -} - -/** - * @brief Functor to convert cudf column to arrow array - */ -struct dispatch_to_arrow { - /** - * @brief Creates vector Arrays from given cudf column children - */ - std::vector> fetch_child_array( - column_view input_view, - std::vector const& metadata, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) - { - std::vector> child_arrays; - std::transform( - input_view.child_begin(), - input_view.child_end(), - metadata.begin(), - std::back_inserter(child_arrays), - [&ar_mr, &stream](auto const& child, auto const& meta) { - return type_dispatcher( - child.type(), dispatch_to_arrow{}, child, child.type().id(), meta, ar_mr, stream); - }); - return child_arrays; - } - - template ())> - std::shared_ptr operator()( - column_view, cudf::type_id, column_metadata const&, arrow::MemoryPool*, rmm::cuda_stream_view) - { - CUDF_FAIL("Unsupported type for to_arrow."); - } - - template ())> - std::shared_ptr operator()(column_view input_view, - cudf::type_id id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) - { - return to_arrow_array( - id, - static_cast(input_view.size()), - fetch_data_buffer( - device_span(input_view.data(), input_view.size()), ar_mr, stream), - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); - } -}; - -// Convert decimal types from libcudf to arrow where those types are not -// directly supported by Arrow. These types must be fit into 128 bits, the -// smallest decimal resolution supported by Arrow. -template -std::shared_ptr unsupported_decimals_to_arrow(column_view input, - int32_t precision, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - auto buf = detail::convert_decimals_to_decimal128( - input, stream, rmm::mr::get_current_device_resource()); - - // Synchronize stream here to ensure the decimal128 buffer is ready. 
- stream.synchronize(); - - auto const buf_size_in_bytes = buf->size(); - auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - buf->data(), - buf_size_in_bytes, - cudaMemcpyDefault, - stream.value())); - - auto type = arrow::decimal(precision, -input.type().scale()); - auto mask = fetch_mask_buffer(input, ar_mr, stream); - auto buffers = std::vector>{mask, std::move(data_buffer)}; - auto data = std::make_shared(type, input.size(), buffers); - - return std::make_shared(data); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - using DeviceType = int32_t; - return unsupported_decimals_to_arrow( - input, cudf::detail::max_precision(), ar_mr, stream); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - using DeviceType = int64_t; - return unsupported_decimals_to_arrow( - input, cudf::detail::max_precision(), ar_mr, stream); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - using DeviceType = __int128_t; - auto const max_precision = cudf::detail::max_precision(); - - rmm::device_uvector buf(input.size(), stream); - - thrust::copy(rmm::exec_policy(stream), // - input.begin(), - input.end(), - buf.begin()); - - auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); - auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - - auto type = arrow::decimal(max_precision, -input.type().scale()); - auto mask = fetch_mask_buffer(input, ar_mr, stream); - auto buffers = std::vector>{mask, std::move(data_buffer)}; - auto data = std::make_shared(type, input.size(), buffers); - - return std::make_shared(data); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()(column_view input, - cudf::type_id id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - auto bitmask = detail::bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); - - auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - bitmask.first->data(), - bitmask.first->size(), - cudaMemcpyDefault, - stream.value())); - return to_arrow_array(id, - static_cast(input.size()), - std::move(data_buffer), - fetch_mask_buffer(input, ar_mr, stream), - static_cast(input.null_count())); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - std::unique_ptr tmp_column = - ((input.offset() != 0) or - ((input.num_children() == 1) and (input.child(0).size() - 1 != input.size()))) - ? std::make_unique(input, stream) - : nullptr; - - column_view input_view = (tmp_column != nullptr) ? 
tmp_column->view() : input; - auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream); - if (child_arrays.empty()) { - // Empty string will have only one value in offset of 4 bytes - auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); - auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); - memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); - - return std::make_shared( - 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); - } - auto offset_buffer = child_arrays[strings_column_view::offsets_column_index]->data()->buffers[1]; - auto const sview = strings_column_view{input_view}; - auto data_buffer = fetch_data_buffer( - device_span{sview.chars_begin(stream), - static_cast(sview.chars_size(stream))}, - ar_mr, - stream); - if (sview.offsets().type().id() == cudf::type_id::INT64) { - return std::make_shared(static_cast(input_view.size()), - offset_buffer, - data_buffer, - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); - } else { - return std::make_shared(static_cast(input_view.size()), - offset_buffer, - data_buffer, - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); - } -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const& metadata, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(metadata.children_meta.size() == static_cast(input.num_children()), - "Number of field names and number of children doesn't match\n"); - std::unique_ptr tmp_column = nullptr; - - if (input.offset() != 0) { tmp_column = std::make_unique(input, stream); } - - column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input; - auto child_arrays = fetch_child_array(input_view, metadata.children_meta, ar_mr, stream); - auto mask = fetch_mask_buffer(input_view, ar_mr, stream); - - std::vector> fields; - std::transform(child_arrays.cbegin(), - child_arrays.cend(), - metadata.children_meta.cbegin(), - std::back_inserter(fields), - [](auto const array, auto const meta) { - return std::make_shared( - meta.name, array->type(), array->null_count() > 0); - }); - auto dtype = std::make_shared(fields); - - return std::make_shared(dtype, - static_cast(input_view.size()), - child_arrays, - mask, - static_cast(input_view.null_count())); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const& metadata, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(metadata.children_meta.empty() || - metadata.children_meta.size() == static_cast(input.num_children()), - "Number of field names and number of children do not match\n"); - std::unique_ptr tmp_column = nullptr; - if ((input.offset() != 0) or - ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) { - tmp_column = std::make_unique(input, stream); - } - - column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input; - auto children_meta = - metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; - auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); - if (child_arrays.empty() || child_arrays[0]->data()->length == 0) { - auto element_type = child_arrays.empty() ? 
arrow::null() : child_arrays[1]->type();
-    auto result = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr);
-    CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n");
-    return result.ValueUnsafe();
-  }
-
-  auto offset_buffer = child_arrays[0]->data()->buffers[1];
-  auto data = child_arrays[1];
-  return std::make_shared<arrow::ListArray>(arrow::list(data->type()),
-                                            static_cast<int64_t>(input_view.size()),
-                                            offset_buffer,
-                                            data,
-                                            fetch_mask_buffer(input_view, ar_mr, stream),
-                                            static_cast<int64_t>(input_view.null_count()));
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::dictionary32>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const& metadata,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  // Arrow dictionary requires indices to be signed integer
-  std::unique_ptr<column> dict_indices =
-    detail::cast(cudf::dictionary_column_view(input).get_indices_annotated(),
-                 cudf::data_type{type_id::INT32},
-                 stream,
-                 rmm::mr::get_current_device_resource());
-  auto indices = dispatch_to_arrow{}.operator()<int32_t>(
-    dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream);
-  auto dict_keys = cudf::dictionary_column_view(input).keys();
-  auto dictionary =
-    type_dispatcher(dict_keys.type(),
-                    dispatch_to_arrow{},
-                    dict_keys,
-                    dict_keys.type().id(),
-                    metadata.children_meta.empty() ? column_metadata{} : metadata.children_meta[0],
-                    ar_mr,
-                    stream);
-
-  return std::make_shared<arrow::DictionaryArray>(
-    arrow::dictionary(indices->type(), dictionary->type()), indices, dictionary);
-}
-}  // namespace
-
-std::shared_ptr<arrow::Table> to_arrow(table_view input,
-                                       std::vector<column_metadata> const& metadata,
-                                       rmm::cuda_stream_view stream,
-                                       arrow::MemoryPool* ar_mr)
-{
-  CUDF_EXPECTS((metadata.size() == static_cast<std::size_t>(input.num_columns())),
-               "columns' metadata should be equal to number of columns in table");
-
-  std::vector<std::shared_ptr<arrow::Array>> arrays;
-  std::vector<std::shared_ptr<arrow::Field>> fields;
-
-  std::transform(
-    input.begin(),
-    input.end(),
-    metadata.begin(),
-    std::back_inserter(arrays),
-    [&](auto const& c, auto const& meta) {
-      return c.type().id() != type_id::EMPTY
-               ? type_dispatcher(
-                   c.type(), detail::dispatch_to_arrow{}, c, c.type().id(), meta, ar_mr, stream)
-               : std::make_shared<arrow::NullArray>(c.size());
-    });
-
-  std::transform(
-    arrays.begin(),
-    arrays.end(),
-    metadata.begin(),
-    std::back_inserter(fields),
-    [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); });
-
-  auto result = arrow::Table::Make(arrow::schema(fields), arrays);
-
-  // synchronize the stream because after the return the data may be accessed from the host before
-  // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host
-  // memory is used).
-  stream.synchronize();
-
-  return result;
-}
-
-std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
-                                        column_metadata const& metadata,
-                                        rmm::cuda_stream_view stream,
-                                        arrow::MemoryPool* ar_mr)
-{
-  auto const column = cudf::make_column_from_scalar(input, 1, stream);
-  cudf::table_view const tv{{column->view()}};
-  auto const arrow_table = detail::to_arrow(tv, {metadata}, stream, ar_mr);
-  auto const ac = arrow_table->column(0);
-  auto const maybe_scalar = ac->GetScalar(0);
-  if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); }
-  return maybe_scalar.ValueOrDie();
-}
-}  // namespace detail
-
-std::shared_ptr<arrow::Table> to_arrow(table_view input,
-                                       std::vector<column_metadata> const& metadata,
-                                       rmm::cuda_stream_view stream,
-                                       arrow::MemoryPool* ar_mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::to_arrow(input, metadata, stream, ar_mr);
-}
-
-std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
-                                        column_metadata const& metadata,
-                                        rmm::cuda_stream_view stream,
-                                        arrow::MemoryPool* ar_mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::to_arrow(input, metadata, stream, ar_mr);
-}
-}  // namespace cudf
diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp
index 08eada632a5..70a9fe64d70 100644
--- a/cpp/tests/interop/arrow_utils.hpp
+++ b/cpp/tests/interop/arrow_utils.hpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include
 #include
 #include
@@ -30,11 +32,65 @@
 #include
 #include
+#include
 #include
 
-#include
-
-#pragma once
+// Creating arrow as per given type_id and buffer arguments
+template <typename... Ts>
+std::shared_ptr<arrow::Array> to_arrow_array(cudf::type_id id, Ts&&... args)
+{
+  switch (id) {
+    case cudf::type_id::BOOL8:
+      return std::make_shared<arrow::BooleanArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT8: return std::make_shared<arrow::Int8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT16:
+      return std::make_shared<arrow::Int16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT32:
+      return std::make_shared<arrow::Int32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT64:
+      return std::make_shared<arrow::Int64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT8:
+      return std::make_shared<arrow::UInt8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT16:
+      return std::make_shared<arrow::UInt16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT32:
+      return std::make_shared<arrow::UInt32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT64:
+      return std::make_shared<arrow::UInt64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT32:
+      return std::make_shared<arrow::FloatArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT64:
+      return std::make_shared<arrow::DoubleArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_DAYS:
+      return std::make_shared<arrow::Date32Array>(std::make_shared<arrow::Date32Type>(),
+                                                  std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_SECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::SECOND),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MILLISECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MILLI),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MICROSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MICRO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_NANOSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::NANO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_SECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::SECOND),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MILLISECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MILLI),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MICROSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MICRO),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_NANOSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::NANO),
+                                                    std::forward<Ts>(args)...);
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow");
+  }
+}
 
 template <typename T>
 std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>,
                  std::shared_ptr<arrow::Array>>
 get_arrow_array(std::vector<T> const& data, std::vector<uint8_t> const& mask = {})
@@ -50,7 +106,7 @@ get_arrow_array(std::vector<T> const& data, std::vector<uint8_t> const& mask = {})
   std::shared_ptr<arrow::Buffer> mask_buffer =
     mask.empty() ? nullptr : arrow::internal::BytesToBits(mask).ValueOrDie();
 
-  return cudf::detail::to_arrow_array(cudf::type_to_id<T>(), data.size(), data_buffer, mask_buffer);
+  return to_arrow_array(cudf::type_to_id<T>(), data.size(), data_buffer, mask_buffer);
 }
 
 template <typename T>
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index cdc5aa41abe..9b718b2ed83 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -38,12 +38,70 @@
 #include
 #include
+#include
 #include
 
 using cudf::jni::ptr_as_jlong;
 using cudf::jni::release_as_jlong;
 
+// Creating arrow as per given type_id and buffer arguments
+template <typename... Ts>
+std::shared_ptr<arrow::Array> to_arrow_array(cudf::type_id id, Ts&&... args)
+{
+  switch (id) {
+    case cudf::type_id::BOOL8:
+      return std::make_shared<arrow::BooleanArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT8: return std::make_shared<arrow::Int8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT16:
+      return std::make_shared<arrow::Int16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT32:
+      return std::make_shared<arrow::Int32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT64:
+      return std::make_shared<arrow::Int64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT8:
+      return std::make_shared<arrow::UInt8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT16:
+      return std::make_shared<arrow::UInt16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT32:
+      return std::make_shared<arrow::UInt32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT64:
+      return std::make_shared<arrow::UInt64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT32:
+      return std::make_shared<arrow::FloatArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT64:
+      return std::make_shared<arrow::DoubleArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_DAYS:
+      return std::make_shared<arrow::Date32Array>(std::make_shared<arrow::Date32Type>(),
+                                                  std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_SECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::SECOND),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MILLISECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MILLI),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MICROSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MICRO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_NANOSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::NANO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_SECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::SECOND),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MILLISECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MILLI),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MICROSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MICRO),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_NANOSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::NANO),
+                                                    std::forward<Ts>(args)...);
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow");
+  }
+}
+
 extern "C" {
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(
@@ -141,15 +199,27 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv* env,
       break;
     default:
      // this handles the primitive types
-      arrow_array = cudf::detail::to_arrow_array(
-        n_type, j_col_length, data_buffer, null_buffer, j_null_count);
+      arrow_array = to_arrow_array(n_type, j_col_length, data_buffer, null_buffer, j_null_count);
   }
   auto name_and_type = arrow::field("col", arrow_array->type());
   std::vector<std::shared_ptr<arrow::Field>> fields = {name_and_type};
   std::shared_ptr<arrow::Schema> schema = std::make_shared<arrow::Schema>(fields);
   auto arrow_table =
     arrow::Table::Make(schema, std::vector<std::shared_ptr<arrow::Array>>{arrow_array});
-  auto retCols = cudf::from_arrow(*(arrow_table))->release();
+
+  ArrowSchema sch;
+  if (!arrow::ExportSchema(*arrow_table->schema(), &sch).ok()) {
+    JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowSchema", 0)
+  }
+  auto batch = arrow_table->CombineChunksToBatch().ValueOrDie();
+  ArrowArray arr;
+  if (!arrow::ExportRecordBatch(*batch, &arr).ok()) {
+    JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowArray", 0)
+  }
+  auto retCols = cudf::from_arrow(&sch, &arr)->release();
+  arr.release(&arr);
+  sch.release(&sch);
+
   if (retCols.size() != 1) {
     JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Must result in one column", 0);
   }
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index ecc551f1143..c749c8c84bf 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -54,6 +54,8 @@
 
 #include
 
+#include
+#include
 #include
 #include
 
@@ -1069,6 +1071,15 @@ void append_flattened_child_names(cudf::io::column_name_info const& info,
   }
 }
 
+// Recursively make schema and its children nullable
+void set_nullable(ArrowSchema* schema)
+{
+  schema->flags |= ARROW_FLAG_NULLABLE;
+  for (int i = 0; i < schema->n_children; ++i) {
+    set_nullable(schema->children[i]);
+  }
+}
+
 }  // namespace
 }  // namespace jni
 
@@ -2635,7 +2646,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv
       // The pointer to the shared_ptr<> is returned as a jlong.
       using result_t = std::shared_ptr<arrow::Table>;
 
-      auto result = cudf::to_arrow(*tview, state->get_column_metadata(*tview));
+      auto got_arrow_schema = cudf::to_arrow_schema(*tview, state->get_column_metadata(*tview));
+      cudf::jni::set_nullable(got_arrow_schema.get());
+      auto got_arrow_array = cudf::to_arrow_host(*tview);
+      auto batch =
+        arrow::ImportRecordBatch(&got_arrow_array->array, got_arrow_schema.get()).ValueOrDie();
+      auto result = arrow::Table::FromRecordBatches({batch}).ValueOrDie();
+
       return ptr_as_jlong(new result_t{result});
     }
     CATCH_STD(env, 0)
@@ -2746,7 +2763,21 @@ Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv* env, jclass, jlong arr
   try {
     cudf::jni::auto_set_device(env);
-    return convert_table_for_return(env, cudf::from_arrow(*(handle->get())));
+
+    ArrowSchema sch;
+    if (!arrow::ExportSchema(*handle->get()->schema(), &sch).ok()) {
+      JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowSchema", 0)
+    }
+    auto batch = handle->get()->CombineChunksToBatch().ValueOrDie();
+    ArrowArray arr;
+    if (!arrow::ExportRecordBatch(*batch, &arr).ok()) {
+      JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowArray", 0)
+    }
+    auto ret = cudf::from_arrow(&sch, &arr);
+    arr.release(&arr);
+    sch.release(&sch);
+
+    return convert_table_for_return(env, ret);
   }
   CATCH_STD(env, 0)
 }

From 1fd96756daf90b8d2f901fe19a168e9d11974c0b Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Wed, 21 Aug 2024 21:10:20 -0700
Subject: [PATCH 105/270] Fix overflow bug in low-memory JSON reader (#16632)

Bug fix for #16627.
Changes byte range offsets and sizes from `size_type` to `size_t` in pylibcudf. Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16632 --- python/pylibcudf/pylibcudf/io/json.pxd | 4 ++-- python/pylibcudf/pylibcudf/io/json.pyx | 12 ++++++------ python/pylibcudf/pylibcudf/libcudf/io/json.pxd | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index ab9b5b99ce2..f65c1034598 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -15,8 +15,8 @@ cpdef TableWithMetadata read_json( list dtypes = *, compression_type compression = *, bool lines = *, - size_type byte_range_offset = *, - size_type byte_range_size = *, + size_t byte_range_offset = *, + size_t byte_range_size = *, bool keep_quotes = *, bool mixed_types_as_string = *, bool prune_columns = *, diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index ce086f4a489..29e49083bc6 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -51,8 +51,8 @@ cdef json_reader_options _setup_json_reader_options( list dtypes, compression_type compression, bool lines, - size_type byte_range_offset, - size_type byte_range_size, + size_t byte_range_offset, + size_t byte_range_size, bool keep_quotes, bool mixed_types_as_string, bool prune_columns, @@ -189,8 +189,8 @@ cpdef TableWithMetadata read_json( list dtypes = None, compression_type compression = compression_type.AUTO, bool lines = False, - size_type byte_range_offset = 0, - size_type byte_range_size = 0, + size_t byte_range_offset = 0, + size_t byte_range_size = 0, bool keep_quotes = False, bool mixed_types_as_string = False, bool prune_columns = False, @@ -212,9 +212,9 @@ cpdef TableWithMetadata read_json( (column_child_name, column_child_type, list of grandchild dtypes). compression: CompressionType, default CompressionType.AUTO The compression format of the JSON source. - byte_range_offset : size_type, default 0 + byte_range_offset : size_t, default 0 Number of bytes to skip from source start. - byte_range_size : size_type, default 0 + byte_range_size : size_t, default 0 Number of bytes to read. By default, will read all bytes. keep_quotes : bool, default False Whether the reader should keep quotes of string values. 
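(Illustrative aside, not part of the patch: with 32-bit `size_type`, a byte-range offset at or beyond 2**31 - 1 would overflow, whereas `size_t` represents it exactly. A minimal usage sketch, assuming pylibcudf's `SourceInfo` API and a hypothetical `big.jsonl` input file:)

```python
import pylibcudf as plc

# Hypothetical sketch: read a 256 MiB slice starting 3 GiB into a large
# JSON-lines file. An offset of 3 * 1024**3 exceeds INT32_MAX, so it only
# round-trips correctly now that the reader options use size_t.
tbl_w_meta = plc.io.json.read_json(
    plc.io.SourceInfo(["big.jsonl"]),  # assumed input path
    lines=True,
    byte_range_offset=3 * 1024**3,     # 3 GiB > 2**31 - 1
    byte_range_size=256 * 1024**2,     # read 256 MiB
)
```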
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd index 7514e6c5258..1c74f8ca3ac 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -27,8 +27,8 @@ cdef extern from "cudf/io/json.hpp" \ cudf_io_types.source_info get_source() except + vector[string] get_dtypes() except + cudf_io_types.compression_type get_compression() except + - size_type get_byte_range_offset() except + - size_type get_byte_range_size() except + + size_t get_byte_range_offset() except + + size_t get_byte_range_size() except + bool is_enabled_lines() except + bool is_enabled_mixed_types_as_string() except + bool is_enabled_prune_columns() except + @@ -41,8 +41,8 @@ cdef extern from "cudf/io/json.hpp" \ void set_compression( cudf_io_types.compression_type compression ) except + - void set_byte_range_offset(size_type offset) except + - void set_byte_range_size(size_type size) except + + void set_byte_range_offset(size_t offset) except + + void set_byte_range_size(size_t size) except + void enable_lines(bool val) except + void enable_mixed_types_as_string(bool val) except + void enable_prune_columns(bool val) except + @@ -73,10 +73,10 @@ cdef extern from "cudf/io/json.hpp" \ cudf_io_types.compression_type compression ) except + json_reader_options_builder& byte_range_offset( - size_type offset + size_t offset ) except + json_reader_options_builder& byte_range_size( - size_type size + size_t size ) except + json_reader_options_builder& lines( bool val From 00ff2ee5ec2fd23c65e759dc2f9d2907a1c9cb00 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 22 Aug 2024 10:35:27 -0700 Subject: [PATCH 106/270] [FEA] Add filesystem argument to `cudf.read_parquet` (#16577) This PR adds a `filesystem` kwarg to `cudf.read_parquet` (in alignment with [the pandas API](https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html)). When a user has already constructed an `fsspec.AbstractFileSystem` object outside of cudf, they can now pass that object in to `read_parquet` to avoid redundant (and possibly inconsistent) filesystem inference. This PR also makes it possible for us to remove [explicit remote-IO handling from dask-cudf](https://github.com/rapidsai/cudf/blob/623dfceb42eb3e73b352b295898ff3e6cfe7c865/python/dask_cudf/dask_cudf/io/parquet.py#L100) (and consolidate the logic in cudf/ioutils). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16577 --- python/cudf/cudf/io/parquet.py | 5 ++- python/cudf/cudf/tests/test_s3.py | 22 ++++++++++ python/cudf/cudf/utils/ioutils.py | 54 ++++++++++++++++++------ python/dask_cudf/dask_cudf/io/parquet.py | 23 +++------- 4 files changed, 75 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index fac51a9e471..560f257c115 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -527,6 +527,7 @@ def read_parquet( engine="cudf", columns=None, storage_options=None, + filesystem=None, filters=None, row_groups=None, use_pandas_metadata=True, @@ -567,7 +568,9 @@ def read_parquet( # Start by trying construct a filesystem object, so we # can apply filters on remote file-systems fs, paths = ioutils._get_filesystem_and_paths( - path_or_data=filepath_or_buffer, storage_options=storage_options + path_or_data=filepath_or_buffer, + storage_options=storage_options, + filesystem=filesystem, ) # Normalize and validate filters diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 6579fd23634..3b23a53091e 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -269,6 +269,28 @@ def test_read_parquet_ext( assert_eq(expect, got1) +def test_read_parquet_filesystem(s3_base, s3so, pdf): + fname = "data.0.parquet" + # NOTE: Need a unique bucket name when a glob pattern + # is used, otherwise fsspec seems to cache the bucket + # contents, and later tests using the same bucket name + # will fail. + bucket = "test_read_parquet_filesystem" + buffer = BytesIO() + pdf.to_parquet(path=buffer) + buffer.seek(0) + fs = get_fs_token_paths("s3://", mode="rb", storage_options=s3so)[0] + with s3_context( + s3_base=s3_base, + bucket=bucket, + files={fname: buffer}, + ): + # Check that a glob pattern works + path = f"s3://{bucket}/{'data.*.parquet'}" + got = cudf.read_parquet(path, filesystem=fs) + assert_eq(pdf, got) + + def test_read_parquet_multi_file(s3_base, s3so, pdf): fname_1 = "test_parquet_reader_multi_file_1.parquet" buffer_1 = BytesIO() diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 4ac9b63985f..18106e7475b 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -12,7 +12,7 @@ import fsspec.implementations.local import numpy as np import pandas as pd -from fsspec.core import get_fs_token_paths +from fsspec.core import expand_paths_if_needed, get_fs_token_paths from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -139,6 +139,9 @@ For other URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more details. +filesystem : fsspec.AbstractFileSystem, default None + Filesystem object to use when reading the parquet data. This argument + should not be used at the same time as `storage_options`. filters : list of tuple, list of lists of tuples, default None If not None, specifies a filter predicate used to filter out row groups using statistics stored for each row group as Parquet metadata. 
Row groups @@ -1536,11 +1539,18 @@ def is_directory(path_or_data, storage_options=None): return False -def _get_filesystem_and_paths(path_or_data, storage_options): +def _get_filesystem_and_paths( + path_or_data, + storage_options, + *, + filesystem=None, +): # Returns a filesystem object and the filesystem-normalized # paths. If `path_or_data` does not correspond to a path or # list of paths (or if the protocol is not supported), the # return will be `None` for the fs and `[]` for the paths. + # If a filesystem object is already available, it can be + # passed with the `filesystem` argument. fs = None return_paths = path_or_data @@ -1557,16 +1567,36 @@ def _get_filesystem_and_paths(path_or_data, storage_options): else: path_or_data = [path_or_data] - try: - fs, _, fs_paths = get_fs_token_paths( - path_or_data, mode="rb", storage_options=storage_options - ) - return_paths = fs_paths - except ValueError as e: - if str(e).startswith("Protocol not known"): - return None, [] - else: - raise e + if filesystem is None: + try: + fs, _, fs_paths = get_fs_token_paths( + path_or_data, mode="rb", storage_options=storage_options + ) + return_paths = fs_paths + except ValueError as e: + if str(e).startswith("Protocol not known"): + return None, [] + else: + raise e + else: + if not isinstance(filesystem, fsspec.AbstractFileSystem): + raise ValueError( + f"Expected fsspec.AbstractFileSystem. Got {filesystem}" + ) + + if storage_options: + raise ValueError( + f"Cannot specify storage_options when an explicit " + f"filesystem object is specified. Got: {storage_options}" + ) + + fs = filesystem + return_paths = [ + fs._strip_protocol(u) + for u in expand_paths_if_needed( + path_or_data, "rb", 1, fs, None + ) + ] return fs, return_paths diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 8f52fce7818..c025280c240 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,11 +23,7 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import ( - _ROW_GROUP_SIZE_BYTES_DEFAULT, - _fsspec_data_transfer, - _is_local_filesystem, -) +from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -93,40 +89,35 @@ def _read_paths( dataset_kwargs = dataset_kwargs or {} dataset_kwargs["partitioning"] = partitioning or "hive" - # Non-local filesystem handling - paths_or_fobs = paths - if not _is_local_filesystem(fs): - paths_or_fobs = [ - _fsspec_data_transfer(fpath, fs=fs) for fpath in paths - ] - # Use cudf to read in data try: df = cudf.read_parquet( - paths_or_fobs, + paths, engine="cudf", columns=columns, row_groups=row_groups if row_groups else None, dataset_kwargs=dataset_kwargs, categorical_partitions=False, + filesystem=fs, **kwargs, ) except RuntimeError as err: # TODO: Remove try/except after null-schema issue is resolved # (See: https://github.com/rapidsai/cudf/issues/12702) - if len(paths_or_fobs) > 1: + if len(paths) > 1: df = cudf.concat( [ cudf.read_parquet( - pof, + path, engine="cudf", columns=columns, row_groups=row_groups[i] if row_groups else None, dataset_kwargs=dataset_kwargs, categorical_partitions=False, + filesystem=fs, **kwargs, ) - for i, pof in enumerate(paths_or_fobs) + for i, path in enumerate(paths) ] ) else: From 81d71fce73306ae88bee1c78ed1f88e10916ad17 Mon Sep 17 00:00:00 2001 From: Jake Awe 
<50372925+AyodeAwe@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:56:09 -0500
Subject: [PATCH 107/270] update-version.sh fix (#16629)

Updates the `update-version.sh` script to include missed version updates.
---
 ci/release/update-version.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 132e58249e6..e79a91510b8 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -51,6 +51,7 @@ DEPENDENCIES=(
   kvikio
   libkvikio
   librmm
+  pylibcudf
   rapids-dask-dependency
   rmm
 )
@@ -77,7 +78,7 @@ for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
   sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
-sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh
+sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh

 # Java files
 NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT"

From e4e867aace96b80fccf030cc02a11f89cbb9c05f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 22 Aug 2024 09:29:44 -1000
Subject: [PATCH 108/270] Annotate `ColumnAccessor._data` labels as `Hashable` (#16623)

The motivating change here is that since we store a dictionary of columns in
`ColumnAccessor`, the labels should be `collections.abc.Hashable`, and
therefore we can type methods that select by key with this annotation. This
led to a mypy-typing-validation cascade that made me type the output of
`def as_column(...) -> ColumnBase`, which also led to typing validation in
several other files. Notably, there are no logic changes here.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16623
---
 python/cudf/cudf/_lib/column.pyi              |  2 +-
 python/cudf/cudf/core/_internals/timezones.py |  2 +-
 python/cudf/cudf/core/column/categorical.py   |  6 +-
 python/cudf/cudf/core/column/column.py        | 22 ++++--
 python/cudf/cudf/core/column/lists.py         |  7 +-
 python/cudf/cudf/core/column/numerical.py     |  4 +-
 python/cudf/cudf/core/column/string.py        |  8 +-
 python/cudf/cudf/core/column_accessor.py      | 76 +++++++++++--------
 python/cudf/cudf/core/copy_types.py           | 19 +++--
 python/cudf/cudf/core/dataframe.py            |  2 +-
 python/cudf/cudf/core/indexed_frame.py        | 26 ++++---
 11 files changed, 105 insertions(+), 69 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
index bcab009c102..bb38488eefb 100644
--- a/python/cudf/cudf/_lib/column.pyi
+++ b/python/cudf/cudf/_lib/column.pyi
@@ -54,7 +54,7 @@ class Column:
     @property
     def mask_ptr(self) -> int: ...
     def set_base_mask(self, value: Buffer | None) -> None: ...
-    def set_mask(self, value: Buffer | None) -> Self: ...
+    def set_mask(self, value: ColumnBase | Buffer | None) -> Self: ...
     @property
     def null_count(self) -> int: ...
@property diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 29cb9d7bd12..fd89904e766 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -120,7 +120,7 @@ def _read_tzfile_as_columns( # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) + return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) # type: ignore[return-value] return tuple(transition_times_and_offsets) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 1fdaf9f8c07..a7e98e5218f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -984,9 +984,9 @@ def find_and_replace( ) replacement_col = catmap._data["index"].astype(replaced.codes.dtype) - replaced = column.as_column(replaced.codes) + replaced_codes = column.as_column(replaced.codes) output = libcudf.replace.replace( - replaced, to_replace_col, replacement_col + replaced_codes, to_replace_col, replacement_col ) result = column.build_categorical_column( @@ -1064,7 +1064,7 @@ def _validate_fillna_value( raise TypeError( "Cannot set a categorical with non-categorical data" ) - fill_value = fill_value._set_categories( + fill_value = cast(CategoricalColumn, fill_value)._set_categories( self.categories, ) return fill_value.codes.astype(self.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 27278120abb..60b4126ddd4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -553,7 +553,7 @@ def __setitem__(self, key: Any, value: Any): """ # Normalize value to scalar/column - value_normalized = ( + value_normalized: cudf.Scalar | ColumnBase = ( cudf.Scalar(value, dtype=self.dtype) if is_scalar(value) else as_column(value, dtype=self.dtype) @@ -609,9 +609,12 @@ def _scatter_by_slice( ) # step != 1, create a scatter map with arange - scatter_map = as_column( - rng, - dtype=cudf.dtype(np.int32), + scatter_map = cast( + cudf.core.column.NumericalColumn, + as_column( + rng, + dtype=cudf.dtype(np.int32), + ), ) return self._scatter_by_column(scatter_map, value) @@ -1111,11 +1114,16 @@ def argsort( if (ascending and self.is_monotonic_increasing) or ( not ascending and self.is_monotonic_decreasing ): - return as_column(range(len(self))) + return cast( + cudf.core.column.NumericalColumn, as_column(range(len(self))) + ) elif (ascending and self.is_monotonic_decreasing) or ( not ascending and self.is_monotonic_increasing ): - return as_column(range(len(self) - 1, -1, -1)) + return cast( + cudf.core.column.NumericalColumn, + as_column(range(len(self) - 1, -1, -1)), + ) else: return libcudf.sort.order_by( [self], [ascending], na_position, stable=True @@ -1752,7 +1760,7 @@ def as_column( nan_as_null: bool | None = None, dtype: Dtype | None = None, length: int | None = None, -): +) -> ColumnBase: """Create a Column from an arbitrary object Parameters diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 302f04a0e71..c6a39199e3b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -256,7 +256,10 @@ def from_sequences( offset += len(data) offset_vals.append(offset) - offset_col = column.as_column(offset_vals, 
dtype=size_type_dtype) + offset_col = cast( + NumericalColumn, + column.as_column(offset_vals, dtype=size_type_dtype), + ) # Build ListColumn res = cls( @@ -338,7 +341,7 @@ def __init__(self, parent: ParentType): def get( self, - index: int, + index: int | ColumnLike, default: ScalarLike | ColumnLike | None = None, ) -> ParentType: """ diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a37355dfcda..90bec049831 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -142,7 +142,7 @@ def __setitem__(self, key: Any, value: Any): """ # Normalize value to scalar/column - device_value = ( + device_value: cudf.Scalar | ColumnBase = ( cudf.Scalar( value, dtype=self.dtype @@ -552,7 +552,7 @@ def _validate_fillna_value( ) -> cudf.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if is_scalar(fill_value): - cudf_obj = cudf.Scalar(fill_value) + cudf_obj: cudf.Scalar | ColumnBase = cudf.Scalar(fill_value) if not as_column(cudf_obj).can_cast_safely(self.dtype): raise TypeError( f"Cannot safely cast non-equivalent " diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6f7508822d4..16e6908f308 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -776,11 +776,13 @@ def contains( # TODO: we silently ignore the `regex=` flag here if case is False: input_column = libstrings.to_lower(self._column) - pat = libstrings.to_lower(column.as_column(pat, dtype="str")) + col_pat = libstrings.to_lower( + column.as_column(pat, dtype="str") + ) else: input_column = self._column - pat = column.as_column(pat, dtype="str") - result_col = libstrings.contains_multiple(input_column, pat) + col_pat = column.as_column(pat, dtype="str") + result_col = libstrings.contains_multiple(input_column, col_pat) return self._return_or_inplace(result_col) def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 7aa3e5f8163..34076fa0060 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -6,7 +6,7 @@ import sys from collections import abc from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Callable, Mapping +from typing import TYPE_CHECKING, Any, Callable, Mapping, cast import numpy as np import pandas as pd @@ -35,7 +35,7 @@ class _NestedGetItemDict(dict): """ @classmethod - def from_zip(cls, data): + def from_zip(cls, data: abc.Iterator): """Create from zip, specialized factory for nesting.""" obj = cls() for key, value in data: @@ -91,12 +91,12 @@ class ColumnAccessor(abc.MutableMapping): column length and data.values() are all Columns """ - _data: dict[Any, ColumnBase] - _level_names: tuple[Any, ...] + _data: dict[abc.Hashable, ColumnBase] + _level_names: tuple[abc.Hashable, ...] 
def __init__( self, - data: abc.MutableMapping[Any, ColumnBase] | Self, + data: abc.MutableMapping[abc.Hashable, ColumnBase] | Self, multiindex: bool = False, level_names=None, rangeindex: bool = False, @@ -141,16 +141,16 @@ def __init__( f"data must be a ColumnAccessor or MutableMapping, not {type(data).__name__}" ) - def __iter__(self): + def __iter__(self) -> abc.Iterator: return iter(self._data) - def __getitem__(self, key: Any) -> ColumnBase: + def __getitem__(self, key: abc.Hashable) -> ColumnBase: return self._data[key] - def __setitem__(self, key: Any, value: ColumnBase) -> None: + def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) - def __delitem__(self, key: Any) -> None: + def __delitem__(self, key: abc.Hashable) -> None: old_ncols = len(self._data) del self._data[key] new_ncols = len(self._data) @@ -186,7 +186,7 @@ def _from_columns_like_self( Whether to verify column length and type. """ if sys.version_info.major >= 3 and sys.version_info.minor >= 10: - data = zip(self.names, columns, strict=True) + data = zip(self.names, columns, strict=True) # type: ignore[call-overload] else: columns = list(columns) if len(columns) != len(self.names): @@ -205,7 +205,7 @@ def _from_columns_like_self( ) @property - def level_names(self) -> tuple[Any, ...]: + def level_names(self) -> tuple[abc.Hashable, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: @@ -221,7 +221,7 @@ def nlevels(self) -> int: return len(next(iter(self.keys()))) @property - def name(self) -> Any: + def name(self) -> abc.Hashable: return self.level_names[-1] @cached_property @@ -232,7 +232,7 @@ def nrows(self) -> int: return len(next(iter(self.values()))) @cached_property - def names(self) -> tuple[Any, ...]: + def names(self) -> tuple[abc.Hashable, ...]: return tuple(self.keys()) @cached_property @@ -291,7 +291,7 @@ def to_pandas_index(self) -> pd.Index: ) elif cudf.api.types.infer_dtype(self.names) == "integer": if len(self.names) == 1: - start = self.names[0] + start = cast(int, self.names[0]) return pd.RangeIndex( start=start, stop=start + 1, step=1, name=self.name ) @@ -299,7 +299,9 @@ def to_pandas_index(self) -> pd.Index: if len(uniques) == 1 and uniques[0] != 0: diff = uniques[0] new_range = range( - self.names[0], self.names[-1] + diff, diff + cast(int, self.names[0]), + cast(int, self.names[-1]) + diff, + diff, ) return pd.RangeIndex(new_range, name=self.name) result = pd.Index( @@ -310,7 +312,9 @@ def to_pandas_index(self) -> pd.Index: ) return result - def insert(self, name: Any, value: ColumnBase, loc: int = -1) -> None: + def insert( + self, name: abc.Hashable, value: ColumnBase, loc: int = -1 + ) -> None: """ Insert column into the ColumnAccessor at the specified location. @@ -457,7 +461,7 @@ def select_by_index(self, index: Any) -> Self: verify=False, ) - def swaplevel(self, i=-2, j=-1) -> Self: + def swaplevel(self, i: abc.Hashable = -2, j: abc.Hashable = -1) -> Self: """ Swap level i with level j. Calling this method does not change the ordering of the values. 
@@ -486,7 +490,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: # swap old keys for i and j for n, row in enumerate(self.names): - new_keys[n][i], new_keys[n][j] = row[j], row[i] + new_keys[n][i], new_keys[n][j] = row[j], row[i] # type: ignore[call-overload, index] new_dict.update({row: tuple(new_keys[n])}) # TODO: Change to deep=False when copy-on-write is default @@ -494,10 +498,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: # swap level_names for i and j new_names = list(self.level_names) - new_names[i], new_names[j] = new_names[j], new_names[i] + new_names[i], new_names[j] = new_names[j], new_names[i] # type: ignore[call-overload] return type(self)( - new_data, + new_data, # type: ignore[arg-type] multiindex=self.multiindex, level_names=new_names, rangeindex=self.rangeindex, @@ -505,7 +509,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: verify=False, ) - def set_by_label(self, key: Any, value: ColumnBase) -> None: + def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: """ Add (or modify) column by name. @@ -555,7 +559,7 @@ def _select_by_label_list_like(self, key: tuple) -> Self: verify=False, ) - def _select_by_label_grouped(self, key: Any) -> Self: + def _select_by_label_grouped(self, key: abc.Hashable) -> Self: result = self._grouped_data[key] if isinstance(result, column.ColumnBase): # self._grouped_data[key] = self._data[key] so skip validation @@ -606,8 +610,12 @@ def _select_by_label_slice(self, key: slice) -> Self: ) def _select_by_label_with_wildcard(self, key: tuple) -> Self: - key = self._pad_key(key, slice(None)) - data = {k: self._data[k] for k in self.names if _keys_equal(k, key)} + pad_key = self._pad_key(key, slice(None)) + data = { + k: self._data[k] + for k in self.names + if _keys_equal(k, pad_key) # type: ignore[arg-type] + } return type(self)( data, multiindex=self.multiindex, @@ -616,7 +624,9 @@ def _select_by_label_with_wildcard(self, key: tuple) -> Self: verify=False, ) - def _pad_key(self, key: Any, pad_value="") -> Any: + def _pad_key( + self, key: abc.Hashable, pad_value: str | slice = "" + ) -> abc.Hashable: """ Pad the provided key to a length equal to the number of levels. @@ -628,7 +638,9 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Mapping[Any, Any] | Callable, level: int | None = None + self, + mapper: Mapping[abc.Hashable, abc.Hashable] | Callable, + level: int | None = None, ) -> Self: """ Rename the specified levels of the given ColumnAccessor @@ -701,14 +713,14 @@ def rename_column(x): verify=False, ) - def droplevel(self, level) -> None: + def droplevel(self, level: int) -> None: # drop the nth level if level < 0: level += self.nlevels old_ncols = len(self._data) self._data = { - _remove_key_level(key, level): value + _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } new_ncols = len(self._data) @@ -722,7 +734,7 @@ def droplevel(self, level) -> None: self._clear_cache(old_ncols, new_ncols) -def _keys_equal(target: Any, key: Any) -> bool: +def _keys_equal(target: abc.Hashable, key: abc.Iterable) -> bool: """ Compare `key` to `target`. @@ -740,7 +752,7 @@ def _keys_equal(target: Any, key: Any) -> bool: return True -def _remove_key_level(key: Any, level: int) -> Any: +def _remove_key_level(key: tuple, level: int) -> abc.Hashable: """ Remove a level from key. If detupleize is True, and if only a single level remains, convert the tuple to a scalar. 
@@ -751,7 +763,9 @@ def _remove_key_level(key: Any, level: int) -> Any: return result -def _get_level(x, nlevels, level_names): +def _get_level( + x: abc.Hashable, nlevels: int, level_names: tuple[abc.Hashable, ...] +) -> abc.Hashable: """Get the level index from a level number or name. If given an integer, this function will handle wraparound for diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 6afbc0bbc65..16d8964f083 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast @@ -44,15 +44,17 @@ class GatherMap: If the map is not in bounds. """ - #: The gather map - column: "NumericalColumn" #: The number of rows the gather map has been validated for nrows: int #: Was the validation for nullify=True? nullify: bool def __init__(self, column: Any, nrows: int, *, nullify: bool): - self.column = cudf.core.column.as_column(column) + #: The gather map + self.column = cast( + cudf.core.column.NumericalColumn, + cudf.core.column.as_column(column), + ) self.nrows = nrows self.nullify = nullify if len(self.column) == 0: @@ -135,11 +137,12 @@ class BooleanMask: If the mask has the wrong number of rows """ - #: The boolean mask - column: "NumericalColumn" - def __init__(self, column: Any, nrows: int): - self.column = cudf.core.column.as_column(column) + #: The boolean mask + self.column = cast( + cudf.core.column.NumericalColumn, + cudf.core.column.as_column(column), + ) if self.column.dtype.kind != "b": raise TypeError("Boolean mask must have bool dtype") if len(column) != nrows: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 43693ec20b1..14b63c2b0d7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5830,7 +5830,7 @@ def from_records( df = cls._from_data( ColumnAccessor( - data=ca_data, + data=ca_data, # type: ignore[arg-type] multiindex=isinstance( columns, (pd.MultiIndex, cudf.MultiIndex) ), diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e46e24dd0d8..60253b9ae5d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -40,7 +40,7 @@ from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, as_column +from cudf.core.column import ColumnBase, NumericalColumn, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask, GatherMap from cudf.core.dtypes import ListDtype @@ -3008,9 +3008,12 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: if stride != 1: return self._gather( GatherMap.from_column_unchecked( - as_column( - range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + cast( + NumericalColumn, + as_column( + range(start, stop, stride), + dtype=libcudf.types.size_type_dtype, + ), ), len(self), nullify=False, @@ -4761,10 +4764,13 @@ def _sample_axis_0( ): try: gather_map = GatherMap.from_column_unchecked( - cudf.core.column.as_column( - random_state.choice( - len(self), size=n, replace=replace, p=weights - ) + cast( + NumericalColumn, + cudf.core.column.as_column( + random_state.choice( + len(self), size=n, replace=replace, p=weights + ) + ), ), len(self), 
                nullify=False,
@@ -6599,7 +6605,7 @@ def _drop_rows_by_labels(
             level = 0

     levels_index = obj.index.get_level_values(level)
-    if errors == "raise" and not labels.isin(levels_index).all():
+    if errors == "raise" and not labels.isin(levels_index).all():  # type: ignore[union-attr]
         raise KeyError("One or more values not found in axis")

     if isinstance(level, int):
@@ -6649,7 +6655,7 @@ def _drop_rows_by_labels(
         )

     else:
-        if errors == "raise" and not labels.isin(obj.index).all():
+        if errors == "raise" and not labels.isin(obj.index).all():  # type: ignore[union-attr]
             raise KeyError("One or more values not found in axis")

         if isinstance(labels, ColumnBase):

From 8b20298c960387c825cfd1476bcf0bc9119df58e Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 22 Aug 2024 16:48:22 -0500
Subject: [PATCH 109/270] Move pragma once in rolling/jit/operation.hpp. (#16636)

I noticed from
https://github.com/rapidsai/cudf/pull/16590#discussion_r1725842333 that there
was one other file where `#pragma once` was not at the top. This PR fixes
that.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16636
---
 cpp/src/rolling/jit/operation.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/rolling/jit/operation.hpp b/cpp/src/rolling/jit/operation.hpp
index f8a52c03d4e..3be739ec5bf 100644
--- a/cpp/src/rolling/jit/operation.hpp
+++ b/cpp/src/rolling/jit/operation.hpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include "rolling/jit/operation-udf.hpp"
 
 #include <cudf/types.hpp>
 
-#pragma once
-
 struct rolling_udf_ptx {
   template <typename OutType, typename InType>
   static OutType operate(InType const* in_col, cudf::size_type start, cudf::size_type count)

From eaefcb4e9baa587f40bc6daa5452c170b9f9616b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:33:13 -1000
Subject: [PATCH 110/270] Support DecimalDtype meta in dask_cudf (#16634)

To enable some TPC-H benchmarking for dask-cudf.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/16634
---
 python/dask_cudf/dask_cudf/backends.py        |  2 ++
 python/dask_cudf/dask_cudf/tests/test_join.py | 11 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 16b2c8959e2..5bd3eb5fa7f 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -134,6 +134,8 @@ def _get_non_empty_data(
         return cudf.core.column.as_column(
             np.arange(start=0, stop=2, dtype=s.dtype)
         )
+    elif isinstance(s.dtype, cudf.core.dtypes.DecimalDtype):
+        return cudf.core.column.as_column(range(2), dtype=s.dtype)
     else:
         raise TypeError(
             f"Don't know how to handle column of type {type(s).__name__}"
diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py
index ed291ef31a7..3e078c47cdd 100644
--- a/python/dask_cudf/dask_cudf/tests/test_join.py
+++ b/python/dask_cudf/dask_cudf/tests/test_join.py
@@ -386,3 +386,14 @@ def test_issue_12773():
         expected.to_pandas(),
         check_index=False,
     )
+
+
+@pytest.mark.parametrize(
+    "typ", [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype]
+)
+def test_merge_on_decimal(typ):
+    df = cudf.DataFrame({"a": [1], "b": [2]}, dtype=typ(1))
+    ddf = dask_cudf.from_cudf(df, npartitions=1)
+    result = ddf.merge(ddf, left_on="a", right_on="a")
right_on="a") + expected = df.merge(df, left_on="a", right_on="a") + dd.assert_eq(result, expected) From 83f68c920f51f9e69f2a5bf0fddf26babac2483b Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 22 Aug 2024 18:59:47 -0400 Subject: [PATCH 111/270] Revert "Hide all gtest symbols in cudftestutil (#16546)" (#16644) This reverts commit ac42bc870a65d807784cae63e25b9e9ca788eb23. We need to revert #16546 as it broke the gtest builds for cudf. Therefore gtests that actually fail wouldn't properly report an error but silently continue and report as passed. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16644 --- cpp/cmake/thirdparty/get_gtest.cmake | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index ec8cbd8c568..10e6b026d9a 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -16,18 +16,9 @@ function(find_and_configure_gtest) include(${rapids-cmake-dir}/cpm/gtest.cmake) - # Mark all the non explicit googletest symbols as hidden. This ensures that libcudftestutil can be - # used by consumers with a different shared gtest. - set(gtest_hide_internal_symbols ON) - # Find or install GoogleTest rapids_cpm_gtest(BUILD_STATIC) - # Mark all the explicit googletest symbols as hidden. This ensures that libcudftestutil can be - # used by consumers with a different shared gtest. - if(TARGET gtest) - target_compile_definitions(gtest PUBLIC "$") - endif() endfunction() find_and_configure_gtest() From 91f304ecb16dbe06c1405df42ada9b66875f61c8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 23 Aug 2024 07:51:23 -0500 Subject: [PATCH 112/270] Enable testing `cudf.pandas` unit tests for all minor versions of pandas (#16595) Fixes: https://github.com/rapidsai/cudf/issues/16537 This PR enables testing `cudf.pandas` unit tests with all minor versions of pandas-2 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16595 --- .../fetch_pandas_versions.py | 24 +++++++++++++ ci/cudf_pandas_scripts/run_tests.sh | 36 ++++++++++++++++--- .../cudf_pandas_tests/test_cudf_pandas.py | 18 ++++++++++ .../cudf/cudf_pandas_tests/test_profiler.py | 8 +++++ 4 files changed, 82 insertions(+), 4 deletions(-) create mode 100644 ci/cudf_pandas_scripts/fetch_pandas_versions.py diff --git a/ci/cudf_pandas_scripts/fetch_pandas_versions.py b/ci/cudf_pandas_scripts/fetch_pandas_versions.py new file mode 100644 index 00000000000..b6913f947e8 --- /dev/null +++ b/ci/cudf_pandas_scripts/fetch_pandas_versions.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import requests +from packaging.version import Version +from packaging.specifiers import SpecifierSet +import argparse + +def get_pandas_versions(pandas_range): + url = "https://pypi.org/pypi/pandas/json" + response = requests.get(url) + data = response.json() + versions = [Version(v) for v in data['releases']] + specifier = SpecifierSet(pandas_range.lstrip("pandas")) + matching_versions = [v for v in versions if v in specifier] + matching_minors = sorted(set(".".join((str(v.major), str(v.minor))) for v in matching_versions), key=Version) + return matching_minors + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Filter pandas versions by prefix.") + parser.add_argument("pandas_range", type=str, help="The version prefix to filter by.") + args = parser.parse_args() + + versions = get_pandas_versions(args.pandas_range) + print(','.join(versions)) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 8215ce729b3..5bfc083bcd3 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -9,13 +9,20 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" +DEPENDENCIES_PATH="dependencies.yaml" +package_name="pandas" + +# Use grep to find the line containing the package name and version constraint +pandas_version_constraint=$(grep -oP "pandas>=\d+\.\d+,\<\d+\.\d+\.\d+dev\d+" $DEPENDENCIES_PATH) + # Function to display script usage function display_usage { - echo "Usage: $0 [--no-cudf]" + echo "Usage: $0 [--no-cudf] [pandas-version]" } # Default value for the --no-cudf option no_cudf=false +PANDAS_VERSION="" # Parse command-line arguments while [[ $# -gt 0 ]]; do @@ -25,9 +32,14 @@ while [[ $# -gt 0 ]]; do shift ;; *) - echo "Error: Unknown option $1" - display_usage - exit 1 + if [[ -z "$PANDAS_VERSION" ]]; then + PANDAS_VERSION=$1 + shift + else + echo "Error: Unknown option $1" + display_usage + exit 1 + fi ;; esac done @@ -53,3 +65,19 @@ python -m pytest -p cudf.pandas \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ --cov-report=term \ ./python/cudf/cudf_pandas_tests/ + +output=$(python ci/cudf_pandas_scripts/fetch_pandas_versions.py $pandas_version_constraint) + +# Convert the comma-separated list into an array +IFS=',' read -r -a versions <<< "$output" + +for version in "${versions[@]}"; do + echo "Installing pandas version: ${version}" + python -m pip install "pandas==${version}" + python -m pytest -p cudf.pandas \ + --cov-config=./python/cudf/.coveragerc \ + --cov=cudf \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ + --cov-report=term \ + ./python/cudf/cudf_pandas_tests/ +done diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 6292022d8e4..028f5f173ac 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -42,6 +42,8 @@ get_calendar, ) +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION + # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = xpd._fsproxy_fast @@ -607,6 +609,10 @@ def test_array_function_series_fallback(series): tm.assert_equal(expect, got) +@pytest.mark.xfail( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def 
test_timedeltaproperties(series): psr, sr = series psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]") @@ -666,6 +672,10 @@ def test_maintain_container_subclasses(multiindex): assert isinstance(got, xpd.core.indexes.frozen.FrozenList) +@pytest.mark.xfail( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas due to unsupported boxcar window type", +) def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) @@ -1281,6 +1291,10 @@ def max_times_two(self): assert s.max_times_two() == 6 +@pytest.mark.xfail( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0", +) def test_floordiv_array_vs_df(): xarray = xpd.Series([1, 2, 3], dtype="datetime64[ns]").array parray = pd.Series([1, 2, 3], dtype="datetime64[ns]").array @@ -1552,6 +1566,10 @@ def test_numpy_cupy_flatiter(series): assert type(arr.flat._fsproxy_slow) == np.flatiter +@pytest.mark.xfail( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="pyarrow_numpy storage type was not supported in pandas-2.0.0", +) def test_arrow_string_arrays(): cu_s = xpd.Series(["a", "b", "c"]) pd_s = pd.Series(["a", "b", "c"]) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index 588398265f2..5b7bde06d1d 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -5,6 +5,8 @@ import os import subprocess +import pytest + from cudf.pandas import LOADED, Profiler if not LOADED: @@ -13,7 +15,13 @@ import numpy as np import pandas as pd +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="function names change across versions of pandas, so making sure it only runs on latest version of pandas", +) def test_profiler(): np.random.seed(42) with Profiler() as profiler: From 8d6b2616af8aeec6dfd02d787084c583e2447791 Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Fri, 23 Aug 2024 10:47:40 -0500 Subject: [PATCH 113/270] adding wheel build for libcudf (#15483) Contributes to https://github.com/rapidsai/build-planning/issues/33 Adds a standalone `libcudf` wheel, containing the `libcudf` C++ shared library. 
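(Illustrative aside, not text from the PR description: dependent wheels are expected to load the bundled shared library before importing their own extension modules. A minimal sketch, assuming the `load_library` helper that the new `python/libcudf/libcudf/load.py` module added below appears to provide; its exact name and signature are assumptions here:)

```python
# Hypothetical consumer sketch: make sure libcudf.so from the standalone
# libcudf wheel (or a system install, if one is found first) is loaded
# before any extension module that links against it is imported.
import libcudf

libcudf.load_library()  # assumed helper re-exported from libcudf/load.py
```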
Fixes #16588 Authors: - Mike Sarahan (https://github.com/msarahan) - Vyas Ramasubramani (https://github.com/vyasr) - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15483 --- .github/workflows/build.yaml | 20 ++++ .github/workflows/pr.yaml | 11 ++- ci/build_wheel_cudf.sh | 26 ++++-- ci/build_wheel_libcudf.sh | 15 +++ ci/build_wheel_pylibcudf.sh | 22 ++++- ci/cudf_pandas_scripts/pandas-tests/run.sh | 8 +- ci/cudf_pandas_scripts/run_tests.sh | 8 +- ci/release/update-version.sh | 1 + ci/test_wheel_cudf.sh | 8 +- ci/test_wheel_cudf_polars.sh | 8 +- ci/test_wheel_dask_cudf.sh | 10 +- dependencies.yaml | 91 ++++++++++++++++++- python/cudf/CMakeLists.txt | 69 ++------------ python/cudf/cudf/__init__.py | 10 ++ python/cudf/cudf/_lib/CMakeLists.txt | 1 + python/cudf/pyproject.toml | 3 + python/libcudf/CMakeLists.txt | 58 ++++++++++++ python/libcudf/LICENSE | 1 + python/libcudf/README.md | 1 + .../cmake/Modules/WheelHelpers.cmake | 0 python/libcudf/libcudf/VERSION | 1 + python/libcudf/libcudf/__init__.py | 16 ++++ python/libcudf/libcudf/_version.py | 33 +++++++ python/libcudf/libcudf/load.py | 51 +++++++++++ python/libcudf/pyproject.toml | 75 +++++++++++++++ python/pylibcudf/CMakeLists.txt | 68 ++------------ python/pylibcudf/pylibcudf/CMakeLists.txt | 2 + python/pylibcudf/pylibcudf/__init__.py | 10 ++ python/pylibcudf/pyproject.toml | 3 + 29 files changed, 476 insertions(+), 154 deletions(-) create mode 100755 ci/build_wheel_libcudf.sh create mode 100644 python/libcudf/CMakeLists.txt create mode 120000 python/libcudf/LICENSE create mode 120000 python/libcudf/README.md rename python/{pylibcudf => libcudf}/cmake/Modules/WheelHelpers.cmake (100%) create mode 120000 python/libcudf/libcudf/VERSION create mode 100644 python/libcudf/libcudf/__init__.py create mode 100644 python/libcudf/libcudf/_version.py create mode 100644 python/libcudf/libcudf/load.py create mode 100644 python/libcudf/pyproject.toml diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9943b02a521..0ea4d5c54dc 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -67,7 +67,27 @@ jobs: node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" sha: ${{ inputs.sha }} + wheel-build-libcudf: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_libcudf.sh + wheel-publish-libcudf: + needs: wheel-build-libcudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: libcudf wheel-build-pylibcudf: + needs: [wheel-publish-libcudf] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 74bdc666c68..2e2a8b6b9bc 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,6 +23,7 @@ jobs: - static-configure - conda-notebook-tests - docs-build + - wheel-build-libcudf - wheel-build-pylibcudf - wheel-build-cudf - wheel-tests-cudf @@ -121,10 +122,18 @@ jobs: arch: 
"amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" - wheel-build-pylibcudf: + wheel-build-libcudf: needs: checks secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + with: + matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) + build_type: pull-request + script: "ci/build_wheel_libcudf.sh" + wheel-build-pylibcudf: + needs: [checks, wheel-build-libcudf] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index 7c0fb1efebe..cf33703f544 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -5,16 +5,28 @@ set -euo pipefail package_dir="python/cudf" -export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" - -# Download the pylibcudf built in the previous step RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 /tmp/pylibcudf_dist -echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" > /tmp/constraints.txt +# Downloads libcudf and pylibcudf wheels from this current build, +# then ensures 'cudf' wheel builds always use the 'libcudf' and 'pylibcudf' just built in the same CI run. +# +# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints +# are used when creating the isolated build environment. +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python /tmp/pylibcudf_dist +echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt +echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" + ./ci/build_wheel.sh ${package_dir} -python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* +python -m auditwheel repair \ + --exclude libcudf.so \ + --exclude libarrow.so.1601 \ + --exclude libnvcomp.so \ + --exclude libnvcomp_bitcomp.so \ + --exclude libnvcomp_gdeflate.so \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh new file mode 100755 index 00000000000..9694c3f6144 --- /dev/null +++ b/ci/build_wheel_libcudf.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir="python/libcudf" + +./ci/build_wheel.sh ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +mkdir -p ${package_dir}/final_dist +python -m auditwheel repair --exclude libarrow.so.1601 -w ${package_dir}/final_dist ${package_dir}/dist/* + +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index b25d118ff81..7181a49d397 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -5,12 +5,26 @@ set -euo pipefail package_dir="python/pylibcudf" -export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -./ci/build_wheel.sh ${package_dir} +# Downloads libcudf wheel from this current build, +# then ensures 'pylibcudf' wheel builds always use the 'libcudf' just built in the same CI run. +# +# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints +# are used when creating the isolated build environment. +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist +echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt +export PIP_CONSTRAINT="/tmp/constraints.txt" -python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* +./ci/build_wheel.sh ${package_dir} +python -m auditwheel repair \ + --exclude libcudf.so \ + --exclude libarrow.so.1601 \ + --exclude libnvcomp.so \ + --exclude libnvcomp_bitcomp.so \ + --exclude libnvcomp_gdeflate.so \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 97c3139080f..e5cd4436a3a 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -12,13 +12,15 @@ rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# Download the cudf and pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download the cudf, libcudf, and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 5bfc083bcd3..90ea1afbe6a 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -49,13 +49,15 @@ if [ "$no_cudf" = true ]; then 
else RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - # Download the cudf and pylibcudf built in the previous step - RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist - RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + # Download the cudf, libcudf, and pylibcudf built in the previous step + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist + RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist + RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index e79a91510b8..be55b49870f 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -49,6 +49,7 @@ DEPENDENCIES=( dask-cuda dask-cudf kvikio + libcudf libkvikio librmm pylibcudf diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 19131952098..6861d699695 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -5,13 +5,15 @@ set -eou pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# Download the cudf and pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download the cudf, libcudf, and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index e9c6188502c..0baf6c9e277 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -18,16 +18,18 @@ else fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist -# Download pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download libcudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Installing cudf_polars and its dependencies" # echo to expand wildcard before adding `[extra]` 
requires for pip python -m pip install \ "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" rapids-logger "Run cudf_polars tests" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index ff893a08e27..fa74b2398f7 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -4,16 +4,18 @@ set -eou pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist -# Download the cudf and pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download the cudf, libcudf, and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} diff --git a/dependencies.yaml b/dependencies.yaml index 150d03be021..553d01735b2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,6 +13,7 @@ files: - cuda - cuda_version - depends_on_cupy + - depends_on_librmm - depends_on_rmm - develop - docs @@ -95,6 +96,8 @@ files: - build_base - build_python_common - depends_on_pylibcudf + - depends_on_libcudf + - depends_on_librmm - depends_on_rmm py_run_cudf: output: pyproject @@ -106,6 +109,7 @@ files: - run_cudf - pyarrow_run - depends_on_cupy + - depends_on_libcudf - depends_on_pylibcudf - depends_on_rmm py_test_cudf: @@ -117,6 +121,31 @@ files: includes: - test_python_common - test_python_cudf + py_build_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: build-system + includes: + - rapids_build_skbuild + py_rapids_build_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: tool.rapids-build-backend + key: requires + includes: + - build_base + - build_cpp + - build_python_libcudf + - depends_on_librmm + py_run_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: project + includes: + - pyarrow_run py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -133,6 +162,8 @@ files: includes: - build_base - build_python_common + - depends_on_libcudf + - depends_on_librmm - depends_on_rmm py_run_pylibcudf: output: pyproject @@ -140,6 +171,7 @@ files: extras: table: project includes: + - depends_on_libcudf - depends_on_rmm - pyarrow_run - run_pylibcudf @@ -359,13 +391,18 @@ dependencies: - cython>=3.0.3 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- - pyarrow==16.1.0.* + - &pyarrow_build pyarrow==16.1.0.* - output_types: pyproject packages: # Hard pin the patch version used during the build. # Sync with conda build constraint & wheel run constraint. # TODO: Change to `2.0.*` for NumPy 2 - numpy==1.23.* + build_python_libcudf: + common: + - output_types: [conda, requirements, pyproject] + packages: + - *pyarrow_build libarrow_build: common: - output_types: conda @@ -759,6 +796,31 @@ dependencies: packages: - dask-cuda==24.10.*,>=0.0.0a0 - *numba + depends_on_libcudf: + common: + - output_types: conda + packages: + - &libcudf_unsuffixed libcudf==24.10.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for libcudf-cu{11,12}. + - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - libcudf-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - libcudf-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*libcudf_unsuffixed]} depends_on_pylibcudf: common: - output_types: conda @@ -849,6 +911,33 @@ dependencies: packages: &cupy_packages_cu11 - cupy-cuda11x>=12.0.0 - {matrix: null, packages: *cupy_packages_cu11} + depends_on_librmm: + common: + - output_types: conda + packages: + - &librmm_unsuffixed librmm==24.10.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for librmm-cu{11,12}. + - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - librmm-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - librmm-cu11==24.10.*,>=0.0.0a0 + - matrix: + packages: + - *librmm_unsuffixed depends_on_rmm: common: - output_types: conda diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index e11d62b3bd5..72f20b30052 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -24,72 +24,15 @@ project( LANGUAGES CXX CUDA ) -option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" - OFF -) -option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) - -# Find Python early so that later commands can use it -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) - -# If the user requested it we attempt to find CUDF. -if(FIND_CUDF_CPP) - include(rapids-cpm) - include(rapids-export) - include(rapids-find) - rapids_cpm_init() +find_package(cudf "${RAPIDS_VERSION}" REQUIRED) - if(USE_LIBARROW_FROM_PYARROW) - # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow - # libraries. These variables have no effect because we are always searching for arrow via - # pyarrow, but they must be set as they are required arguments to the function in - # get_arrow.cmake. 
- set(CUDF_USE_ARROW_STATIC OFF) - set(CUDF_ENABLE_ARROW_S3 OFF) - set(CUDF_ENABLE_ARROW_ORC OFF) - set(CUDF_ENABLE_ARROW_PYTHON OFF) - set(CUDF_ENABLE_ARROW_PARQUET OFF) - include(../../cpp/cmake/thirdparty/get_arrow.cmake) - endif() - - find_package(cudf "${RAPIDS_VERSION}" REQUIRED) - - # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack - # for the interop.pyx - include(../../cpp/cmake/thirdparty/get_dlpack.cmake) -else() - set(cudf_FOUND OFF) -endif() +# an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack +# for the interop.pyx +include(rapids-cpm) +rapids_cpm_init() +include(../../cpp/cmake/thirdparty/get_dlpack.cmake) include(rapids-cython-core) - -if(NOT cudf_FOUND) - set(BUILD_TESTS OFF) - set(BUILD_BENCHMARKS OFF) - set(CUDF_BUILD_TESTUTIL OFF) - set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) - set(CUDA_STATIC_RUNTIME ON) - - add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL) - - # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous - # components like headers from libcudacxx, but we do need the libraries. However, we want to - # control where they are installed to. Since there are multiple subpackages of cudf._lib that - # require access to libcudf, we place the library and all its dependent artifacts in the cudf - # directory as a single source of truth and modify the other rpaths appropriately. - set(cython_lib_dir cudf) - include(../pylibcudf/cmake/Modules/WheelHelpers.cmake) - # TODO: This install is currently overzealous. We should only install the libraries that are - # downloaded by CPM during the build, not libraries that were found on the system. However, in - # practice right this would only be a problem is if libcudf was not found but some of the - # dependencies were, and we have no real use cases where that happens. - install_aliased_imported_targets( - TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp - DESTINATION ${cython_lib_dir} - ) -endif() - rapids_cython_init() include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index ccc45413de4..d7da42a1708 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,5 +1,15 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +# If libcudf was installed as a wheel, we must request it to load the library symbols. +# Otherwise, we assume that the library was installed in a system path that ld can find. +try: + import libcudf +except ModuleNotFoundError: + pass +else: + libcudf.load_library() + del libcudf + # _setup_numba _must be called before numba.cuda is imported, because # it sets the numba config variable responsible for enabling # Minor Version Compatibility. Setting it after importing numba.cuda has no effect. 
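The guarded import added above is the load-ordering glue for the new wheel layout, and `pylibcudf/__init__.py` repeats it verbatim later in this patch. As a standalone, annotated sketch (this restates the hunk above rather than introducing any new behavior):

```python
# Wheel installs provide a `libcudf` Python package whose load_library()
# dlopens libcudf.so with RTLD_GLOBAL (see python/libcudf/libcudf/load.py
# below). Conda and system installs have no such module, so the import is
# allowed to fail and the system loader is trusted to find libcudf.so.
try:
    import libcudf
except ModuleNotFoundError:
    pass  # not a wheel install; libcudf.so lives on a standard library path
else:
    libcudf.load_library()
    del libcudf  # avoid leaking the module name into the cudf namespace
```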
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index d6182673308..5ea378fc0e5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -63,6 +63,7 @@ rapids_cython_create_modules( ) target_link_libraries(strings_udf PUBLIC cudf_strings_udf) +target_include_directories(interop PUBLIC "$") set(targets_using_arrow_headers avro csv orc json parquet) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 9db52164eca..cb9fa30afab 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "cuda-python>=11.7.1,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", + "libcudf==24.10.*,>=0.0.0a0", "numba>=0.57", "numpy>=1.23,<2.0a0", "nvtx>=0.2.1", @@ -126,6 +127,8 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", + "libcudf==24.10.*,>=0.0.0a0", + "librmm==24.10.*,>=0.0.0a0", "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt new file mode 100644 index 00000000000..09c7ed2e217 --- /dev/null +++ b/python/libcudf/CMakeLists.txt @@ -0,0 +1,58 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +include(../../rapids_config.cmake) + +project( + libcudf-python + VERSION "${RAPIDS_VERSION}" + LANGUAGES CXX +) + +# Check if cudf is already available. If so, it is the user's responsibility to ensure that the +# CMake package is also available at build time of the Python cudf package. +find_package(cudf "${RAPIDS_VERSION}") + +if(cudf_FOUND) + return() +endif() + +unset(cudf_FOUND) + +# For wheels, this should always be true +set(USE_LIBARROW_FROM_PYARROW ON) + +# Find Python early so that later commands can use it +find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) + +set(BUILD_TESTS OFF) +set(BUILD_BENCHMARKS OFF) +set(CUDF_BUILD_TESTUTIL OFF) +set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +set(CUDA_STATIC_RUNTIME ON) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) + +include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) + +add_subdirectory(../../cpp cudf-cpp) + +# Ensure other libraries needed by libcudf.so get installed alongside it. 
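+# (Bundling these is what makes the matching --exclude flags in the
+# auditwheel repair steps safe: arrow and nvcomp ship inside this wheel.)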
+include(cmake/Modules/WheelHelpers.cmake) +install_aliased_imported_targets( + TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp + DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} +) diff --git a/python/libcudf/LICENSE b/python/libcudf/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/libcudf/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/libcudf/README.md b/python/libcudf/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/libcudf/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/pylibcudf/cmake/Modules/WheelHelpers.cmake b/python/libcudf/cmake/Modules/WheelHelpers.cmake similarity index 100% rename from python/pylibcudf/cmake/Modules/WheelHelpers.cmake rename to python/libcudf/cmake/Modules/WheelHelpers.cmake diff --git a/python/libcudf/libcudf/VERSION b/python/libcudf/libcudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/libcudf/libcudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/libcudf/libcudf/__init__.py b/python/libcudf/libcudf/__init__.py new file mode 100644 index 00000000000..10c476cbe89 --- /dev/null +++ b/python/libcudf/libcudf/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcudf._version import __git_commit__, __version__ +from libcudf.load import load_library diff --git a/python/libcudf/libcudf/_version.py b/python/libcudf/libcudf/_version.py new file mode 100644 index 00000000000..7dd732b4905 --- /dev/null +++ b/python/libcudf/libcudf/_version.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +__version__ = ( + importlib.resources.files(__package__) + .joinpath("VERSION") + .read_text() + .strip() +) +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py new file mode 100644 index 00000000000..f6ba0d51bdb --- /dev/null +++ b/python/libcudf/libcudf/load.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ctypes +import os + + +def load_library(): + # This is loading the libarrow shared library in situations where it comes from the + # pyarrow package (i.e. when installed as a wheel). + import pyarrow # noqa: F401 + + # Dynamically load libcudf.so. Prefer a system library if one is present to + # avoid clobbering symbols that other packages might expect, but if no + # other library is present use the one in the wheel. + libcudf_lib = None + try: + libcudf_lib = ctypes.CDLL("libcudf.so", ctypes.RTLD_GLOBAL) + except OSError: + # If neither of these directories contain the library, we assume we are in an + # environment where the C++ library is already installed somewhere else and the + # CMake build of the libcudf Python package was a no-op. + # + # Note that this approach won't work for real editable installs of the libcudf package. + # scikit-build-core has limited support for importlib.resources so there isn't a clean + # way to support that case yet. + for lib_dir in ("lib", "lib64"): + if os.path.isfile( + lib := os.path.join( + os.path.dirname(__file__), lib_dir, "libcudf.so" + ) + ): + libcudf_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL) + break + + # The caller almost never needs to do anything with this library, but no + # harm in offering the option since this object at least provides a handle + # to inspect where libcudf was loaded from. + return libcudf_lib diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml new file mode 100644 index 00000000000..fd01f7f6e2f --- /dev/null +++ b/python/libcudf/pyproject.toml @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[build-system] +build-backend = "rapids_build_backend.build" +requires = [ + "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "scikit-build-core[pyproject]>=0.10.0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+ +[project] +name = "libcudf" +dynamic = ["version"] +description = "cuDF - GPU Dataframe (C++)" +readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "NVIDIA Corporation" }, +] +license = { text = "Apache 2.0" } +requires-python = ">=3.10" +classifiers = [ + "Intended Audience :: Developers", + "Topic :: Database", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: C++", + "Environment :: GPU :: NVIDIA CUDA", +] +dependencies = [ + "pyarrow>=16.1.0,<16.2.0a0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project.urls] +Homepage = "https://github.com/rapidsai/cudf" + +[project.entry-points."cmake.prefix"] +libcudf = "libcudf" + +[tool.scikit-build] +build-dir = "build/{wheel_tag}" +cmake.build-type = "Release" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" +ninja.make-fallback = true +sdist.reproducible = true +wheel.packages = ["libcudf"] +wheel.install-dir = "libcudf" +wheel.py-api = "py3" + +[tool.scikit-build.metadata.version] +provider = "scikit_build_core.metadata.regex" +input = "libcudf/VERSION" +regex = "(?P.*)" + +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" +requires = [ + "cmake>=3.26.4,!=3.30.0", + "librmm==24.10.*,>=0.0.0a0", + "ninja", + "pyarrow==16.1.0.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index 424d8372280..340ad120377 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -24,72 +24,16 @@ project( LANGUAGES CXX CUDA ) -option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" - OFF -) -option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) - -# Find Python early so that later commands can use it -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) - -# If the user requested it we attempt to find CUDF. -if(FIND_CUDF_CPP) - include(rapids-cpm) - include(rapids-export) - include(rapids-find) - rapids_cpm_init() - - if(USE_LIBARROW_FROM_PYARROW) - # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow - # libraries. These variables have no effect because we are always searching for arrow via - # pyarrow, but they must be set as they are required arguments to the function in - # get_arrow.cmake. 
- set(CUDF_USE_ARROW_STATIC OFF) - set(CUDF_ENABLE_ARROW_S3 OFF) - set(CUDF_ENABLE_ARROW_ORC OFF) - set(CUDF_ENABLE_ARROW_PYTHON OFF) - set(CUDF_ENABLE_ARROW_PARQUET OFF) - include(../../cpp/cmake/thirdparty/get_arrow.cmake) - endif() - - find_package(cudf "${RAPIDS_VERSION}" REQUIRED) +find_package(cudf "${RAPIDS_VERSION}" REQUIRED) - # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack - # for the interop.pyx - include(../../cpp/cmake/thirdparty/get_dlpack.cmake) -else() - set(cudf_FOUND OFF) -endif() +# an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack +# for the interop.pyx +include(rapids-cpm) +rapids_cpm_init() +include(../../cpp/cmake/thirdparty/get_dlpack.cmake) include(rapids-cython-core) -if(NOT cudf_FOUND) - set(BUILD_TESTS OFF) - set(BUILD_BENCHMARKS OFF) - set(CUDF_BUILD_TESTUTIL OFF) - set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) - set(CUDA_STATIC_RUNTIME ON) - - add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL) - - # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous - # components like headers from libcudacxx, but we do need the libraries. However, we want to - # control where they are installed to. Since there are multiple subpackages of pylibcudf that - # require access to libcudf, we place the library and all its dependent artifacts in the cudf - # directory as a single source of truth and modify the other rpaths appropriately. - set(cython_lib_dir pylibcudf) - include(cmake/Modules/WheelHelpers.cmake) - # TODO: This install is currently overzealous. We should only install the libraries that are - # downloaded by CPM during the build, not libraries that were found on the system. However, in - # practice right this would only be a problem is if libcudf was not found but some of the - # dependencies were, and we have no real use cases where that happens. - install_aliased_imported_targets( - TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp - DESTINATION ${cython_lib_dir} - ) -endif() - rapids_cython_init() include(cmake/Modules/LinkPyarrowHeaders.cmake) diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index ab21bfe97ab..f81a32e07f9 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -53,6 +53,8 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) +target_include_directories(pylibcudf_interop PUBLIC "$") + include(${rapids-cmake-dir}/export/find_package_root.cmake) include(../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 677fdaf80d0..e784c6c6dd5 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -1,5 +1,15 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +# If libcudf was installed as a wheel, we must request it to load the library symbols. +# Otherwise, we assume that the library was installed in a system path that ld can find. +try: + import libcudf +except ModuleNotFoundError: + pass +else: + libcudf.load_library() + del libcudf + from . 
import ( aggregation, binaryop, diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index b037508d03f..63d76e9fd4e 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -19,6 +19,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cuda-python>=11.7.1,<12.0a0", + "libcudf==24.10.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", "pyarrow>=16.1.0,<16.2.0a0", @@ -101,6 +102,8 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", + "libcudf==24.10.*,>=0.0.0a0", + "librmm==24.10.*,>=0.0.0a0", "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", From a7ca3afb251805face3dd3248381f4cc9503e143 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 23 Aug 2024 12:24:30 -0700 Subject: [PATCH 114/270] Add the missing `num_aggregations` axis for `groupby_max_cardinality` (#16630) This PR fixes a minor bug where the `num_aggregations` axis was missed when working on #16154. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16630 --- cpp/benchmarks/groupby/group_max.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index f41285008c4..b9a701a71f4 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -101,4 +101,5 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") + .add_int64_axis("num_aggregations", {1}) .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); From 7bd14a58cd10504c99044a2d33159bc3d59e7139 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 23 Aug 2024 15:27:12 -0500 Subject: [PATCH 115/270] Add pylibcudf build dir in build.sh for `clean` (#16648) This PR adds `pylibcudf` build dir in `build.sh` for `clean` to properly delete the pylibcudf build files and folders. 
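For context, `clean` only removes directories listed in `BUILD_DIRS`, so a per-package build dir that is missing from the list is silently left behind. A rough sketch of the relevant shape (the clean loop itself sits outside this diff, so the exact commands below are an assumption, not a quote):

```bash
# Assumed shape of build.sh's clean path: only directories enumerated in
# BUILD_DIRS get removed, which is why PYLIBCUDF_BUILD_DIR must be added.
if hasArg clean; then          # hasArg: build.sh argument helper (assumed)
  for bd in ${BUILD_DIRS}; do
    if [ -d "${bd}" ]; then
      rm -rf "${bd}"           # the real script may clean more conservatively
    fi
  done
fi
```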
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16648 --- build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 957f41aedac..211e1db9fbf 100755 --- a/build.sh +++ b/build.sh @@ -54,10 +54,11 @@ KAFKA_LIB_BUILD_DIR=${KAFKA_LIB_BUILD_DIR:=${REPODIR}/cpp/libcudf_kafka/build} CUDF_KAFKA_BUILD_DIR=${REPODIR}/python/cudf_kafka/build CUDF_BUILD_DIR=${REPODIR}/python/cudf/build DASK_CUDF_BUILD_DIR=${REPODIR}/python/dask_cudf/build +PYLIBCUDF_BUILD_DIR=${REPODIR}/python/pylibcudf/build CUSTREAMZ_BUILD_DIR=${REPODIR}/python/custreamz/build CUDF_JAR_JAVA_BUILD_DIR="$REPODIR/java/target" -BUILD_DIRS="${LIB_BUILD_DIR} ${CUDF_BUILD_DIR} ${DASK_CUDF_BUILD_DIR} ${KAFKA_LIB_BUILD_DIR} ${CUDF_KAFKA_BUILD_DIR} ${CUSTREAMZ_BUILD_DIR} ${CUDF_JAR_JAVA_BUILD_DIR}" +BUILD_DIRS="${LIB_BUILD_DIR} ${CUDF_BUILD_DIR} ${DASK_CUDF_BUILD_DIR} ${KAFKA_LIB_BUILD_DIR} ${CUDF_KAFKA_BUILD_DIR} ${CUSTREAMZ_BUILD_DIR} ${CUDF_JAR_JAVA_BUILD_DIR} ${PYLIBCUDF_BUILD_DIR}" # Set defaults for vars modified by flags to this script VERBOSE_FLAG="" From 7ca6a8cfb40291d28dbd0a99e00275e1b4fc869b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 23 Aug 2024 16:53:59 -0400 Subject: [PATCH 116/270] fix libcudf wheel publishing, make package-type explicit in wheel publishing (#16650) Follow-up to #15483. Contributes to https://github.com/rapidsai/build-planning/issues/33 Wheel publishing for `libcudf` is failing like this: ```text Error: File "./dist/*.whl" does not exist ``` ([build link](https://github.com/rapidsai/cudf/actions/runs/10528569930/job/29176811683)) Because the `package-type` was not set to `cpp` in the `wheels-publish` CI workflow, and that workflow defaults to `python`. ([shared-workflows code link](https://github.com/rapidsai/shared-workflows/blob/157e9824e6e2181fca9aa5c4bea4defd4cc322b0/.github/workflows/wheels-publish.yaml#L23-L26)). This fixes that, and makes that choice explicit for all wheel publishing jobs. 
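Concretely, every `wheels-publish` job now passes `package-type` explicitly instead of inheriting the `python` default; condensed from the workflow diff below, the `libcudf` job becomes:

```yaml
wheel-publish-libcudf:
  uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
  with:
    package-name: libcudf
    package-type: cpp  # previously unset, so the shared workflow assumed "python"
```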
References for this `package-type` argument: * https://github.com/rapidsai/shared-workflows/pull/209 * https://github.com/rapidsai/gha-tools/pull/105 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16650 --- .github/workflows/build.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0ea4d5c54dc..72daff7b66b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -86,6 +86,7 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: libcudf + package-type: cpp wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit @@ -106,6 +107,7 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: pylibcudf + package-type: python wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit @@ -126,6 +128,7 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: cudf + package-type: python wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit @@ -148,6 +151,7 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: dask_cudf + package-type: python wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit @@ -170,6 +174,7 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: cudf_polars + package-type: python trigger-pandas-tests: if: inputs.build_type == 'nightly' needs: wheel-build-cudf From 508bdea0dac581d5a33ceb609766c419ef51bbbb Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 23 Aug 2024 19:15:07 -0700 Subject: [PATCH 117/270] Rebuild for & Support NumPy 2 (#16300) Part of issue: https://github.com/rapidsai/build-planning/issues/38 Start building `cudf` with `numpy` version `2.0`. This remains compatible with `numpy` version `1.x` and `2.x`. Allows us to test building with `numpy` version `2.0` (and make sure we catch any issues that show up). Also relaxes the `numpy` `1.x` pin. Pulls in the RDFG changes that are rolling out for broader RAPIDS NumPy 2 support. 
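The resulting constraint split is deliberate, and both lines below appear verbatim in the diffs that follow: pin the build to NumPy 2 (binaries built against 2.x remain importable under 1.x per NumPy's 2.0 compatibility guidance) while the runtime range spans both major versions.

```text
numpy==2.0.*        # build-time pin (conda host requirements / rapids-build-backend requires)
numpy>=1.23,<3.0a0  # run-time range (conda run requirements / project dependencies)
```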
Authors: - https://github.com/jakirkham - Sebastian Berg (https://github.com/seberg) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ray Douglass (https://github.com/raydouglass) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16300 --- ci/cudf_pandas_scripts/run_tests.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 6 ++---- conda/recipes/pylibcudf/meta.yaml | 6 ++---- dependencies.yaml | 8 +++----- python/cudf/pyproject.toml | 4 ++-- python/cudf_kafka/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 10 files changed, 15 insertions(+), 21 deletions(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 90ea1afbe6a..39056d58d56 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -75,7 +75,7 @@ IFS=',' read -r -a versions <<< "$output" for version in "${versions[@]}"; do echo "Installing pandas version: ${version}" - python -m pip install "pandas==${version}" + python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}" python -m pytest -p cudf.pandas \ --cov-config=./python/cudf/.coveragerc \ --cov=cudf \ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 018162bd848..5cf7508ba51 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -57,7 +57,7 @@ dependencies: - notebook - numba>=0.57 - numpy -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 - nvcomp==3.0.6 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index c60ffa7aaa5..28b927254f7 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -56,7 +56,7 @@ dependencies: - notebook - numba>=0.57 - numpy -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - nvcomp==3.0.6 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 7e86147732e..b2dad767da4 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -64,8 +64,7 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - # TODO: Change to `2.0` for NumPy 2 - - numpy 1.23 + - numpy 2.0 - pyarrow ==16.1.0.* - libcudf ={{ version }} - pylibcudf ={{ version }} @@ -84,8 +83,7 @@ requirements: - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - numba >=0.57 - # TODO: Update `numpy` in `host` when dropping `<2.0a0` - - numpy >=1.23,<2.0a0 + - numpy >=1.23,<3.0a0 - {{ pin_compatible('pyarrow', max_pin='x.x') }} - libcudf ={{ version }} - pylibcudf ={{ version }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index f405fd10f5d..fef78467027 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -64,8 +64,7 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - # TODO: Change to `2.0` for NumPy 2 - - numpy 1.23 + - numpy 2.0 - pyarrow ==16.1.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} @@ -81,8 +80,7 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.3dev0 - # TODO: Update `numpy` in `host` when dropping `<2.0a0` - - numpy >=1.23,<2.0a0 + - numpy 
>=1.23,<3.0a0 - {{ pin_compatible('pyarrow', max_pin='x.x') }} - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 diff --git a/dependencies.yaml b/dependencies.yaml index 553d01735b2..194577817db 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -394,10 +394,9 @@ dependencies: - &pyarrow_build pyarrow==16.1.0.* - output_types: pyproject packages: - # Hard pin the patch version used during the build. + # Hard pin the version used during the build. # Sync with conda build constraint & wheel run constraint. - # TODO: Change to `2.0.*` for NumPy 2 - - numpy==1.23.* + - numpy==2.0.* build_python_libcudf: common: - output_types: [conda, requirements, pyproject] @@ -605,8 +604,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0` - - numpy>=1.23,<2.0a0 + - numpy>=1.23,<3.0a0 - pandas>=2.0,<2.2.3dev0 run_pylibcudf: common: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index cb9fa30afab..e7bac17f8ba 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "fsspec>=0.6.0", "libcudf==24.10.*,>=0.0.0a0", "numba>=0.57", - "numpy>=1.23,<2.0a0", + "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.3dev0", @@ -130,7 +130,7 @@ requires = [ "libcudf==24.10.*,>=0.0.0a0", "librmm==24.10.*,>=0.0.0a0", "ninja", - "numpy==1.23.*", + "numpy==2.0.*", "pyarrow==16.1.0.*", "pylibcudf==24.10.*,>=0.0.0a0", "rmm==24.10.*,>=0.0.0a0", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 63c5b07c5f3..2d0222a3fe9 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -106,6 +106,6 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", - "numpy==1.23.*", + "numpy==2.0.*", "pyarrow==16.1.0.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 872ecd35c28..d5da7030a75 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cudf==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numpy>=1.23,<2.0a0", + "numpy>=1.23,<3.0a0", "pandas>=2.0,<2.2.3dev0", "rapids-dask-dependency==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 63d76e9fd4e..5f5594b462b 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -105,7 +105,7 @@ requires = [ "libcudf==24.10.*,>=0.0.0a0", "librmm==24.10.*,>=0.0.0a0", "ninja", - "numpy==1.23.*", + "numpy==2.0.*", "pyarrow==16.1.0.*", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 96f2cc5262e5b6b0f50109d327857e306214b3a4 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 26 Aug 2024 10:21:48 -0400 Subject: [PATCH 118/270] Remove CUDA whole compilation ODR violations (#16603) CUDA whole compilation mode requires that all kernels are only launched from TUs that compile them. 
Previously libcudf would compile a subset of kernels in separate TUs from where they were launched. To keep compile times (and library size) as low as possible I have introduced a single C++ function call between the original call site and the kernel launch. In testing, this showed negligible differences in compile time and binary size.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16603
---
 cpp/src/join/mixed_join.cu                   | 191 ++++++++++---------
 cpp/src/join/mixed_join_kernel.cu            |  10 +-
 cpp/src/join/mixed_join_kernel.cuh           |  64 +++++--
 cpp/src/join/mixed_join_kernel.hpp           |  80 ++++++++
 cpp/src/join/mixed_join_kernel_nulls.cu      |  10 +-
 cpp/src/join/mixed_join_kernels.cuh          | 124 ------------
 cpp/src/join/mixed_join_kernels_semi.cu      |  86 +++++----
 cpp/src/join/mixed_join_kernels_semi.cuh     |  29 +--
 cpp/src/join/mixed_join_semi.cu              |  38 ++--
 cpp/src/join/mixed_join_size_kernel.cu       |  12 +-
 cpp/src/join/mixed_join_size_kernel.cuh      |  64 +++++--
 cpp/src/join/mixed_join_size_kernel.hpp      |  85 +++++++++
 cpp/src/join/mixed_join_size_kernel_nulls.cu |  12 +-
 13 files changed, 472 insertions(+), 333 deletions(-)
 create mode 100644 cpp/src/join/mixed_join_kernel.hpp
 delete mode 100644 cpp/src/join/mixed_join_kernels.cuh
 create mode 100644 cpp/src/join/mixed_join_size_kernel.hpp

diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 48b94c777de..eb12065c6a9 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -16,7 +16,8 @@
 #include "join_common_utils.cuh"
 #include "join_common_utils.hpp"
-#include "mixed_join_kernels.cuh"
+#include "mixed_join_kernel.hpp"
+#include "mixed_join_size_kernel.hpp"

 #include
 #include
@@ -178,9 +179,6 @@ mixed_join(
     join_size = output_size_data->first;
     matches_per_row_span = output_size_data->second;
   } else {
-    // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar size(0, stream, mr);
-
     matches_per_row =
       rmm::device_uvector{static_cast(outer_num_rows), stream, mr};
     // Note that the view goes out of scope after this else statement, but the
@@ -190,37 +188,38 @@ mixed_join(
     matches_per_row_span = cudf::device_span{
       matches_per_row->begin(), static_cast(outer_num_rows)};
     if (has_nulls) {
-      compute_mixed_join_output_size
-        <<>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
+      join_size = launch_compute_mixed_join_output_size(*left_conditional_view,
+                                                        *right_conditional_view,
+                                                        *probe_view,
+                                                        *build_view,
+                                                        hash_probe,
+                                                        equality_probe,
+                                                        kernel_join_type,
+                                                        hash_table_view,
+                                                        parser.device_expression_data,
+                                                        swap_tables,
+                                                        mutable_matches_per_row_span,
+                                                        config,
+                                                        shmem_size_per_block,
+                                                        stream,
+                                                        mr);
     } else {
-      compute_mixed_join_output_size
-        <<>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
+      join_size = launch_compute_mixed_join_output_size(*left_conditional_view,
+                                                        *right_conditional_view,
+                                                        *probe_view,
+                                                        *build_view,
+                                                        hash_probe,
+                                                        equality_probe,
+                                                        kernel_join_type,
+                                                        hash_table_view,
+                                                        parser.device_expression_data,
+                                                        swap_tables,
mutable_matches_per_row_span, + config, + shmem_size_per_block, + stream, + mr); } - join_size = size.value(stream); } // The initial early exit clauses guarantee that we will not reach this point @@ -249,37 +248,39 @@ mixed_join( auto const& join_output_r = right_indices->data(); if (has_nulls) { - mixed_join - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - join_output_l, - join_output_r, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + launch_mixed_join(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + join_output_r, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables, + config, + shmem_size_per_block, + stream); } else { - mixed_join - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - join_output_l, - join_output_r, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + launch_mixed_join(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + join_output_r, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables, + config, + shmem_size_per_block, + stream); } auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); @@ -423,9 +424,6 @@ compute_mixed_join_output_size(table_view const& left_equality, detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - auto const preprocessed_probe = experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; @@ -436,39 +434,42 @@ compute_mixed_join_output_size(table_view const& left_equality, // Determine number of output rows without actually building the output to simply // find what the size of the output will be. 
+  std::size_t size = 0;
   if (has_nulls) {
-    compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        join_type,
-        hash_table_view,
-        parser.device_expression_data,
-        swap_tables,
-        size.data(),
-        matches_per_row_span);
+    size = launch_compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>(
+      *left_conditional_view,
+      *right_conditional_view,
+      *probe_view,
+      *build_view,
+      hash_probe,
+      equality_probe,
+      join_type,
+      hash_table_view,
+      parser.device_expression_data,
+      swap_tables,
+      matches_per_row_span,
+      config,
+      shmem_size_per_block,
+      stream,
+      mr);
   } else {
-    compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        join_type,
-        hash_table_view,
-        parser.device_expression_data,
-        swap_tables,
-        size.data(),
-        matches_per_row_span);
+    size = launch_compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>(
+      *left_conditional_view,
+      *right_conditional_view,
+      *probe_view,
+      *build_view,
+      hash_probe,
+      equality_probe,
+      join_type,
+      hash_table_view,
+      parser.device_expression_data,
+      swap_tables,
+      matches_per_row_span,
+      config,
+      shmem_size_per_block,
+      stream,
+      mr);
   }
-  return {size.value(stream), std::move(matches_per_row)};
+  return {size, std::move(matches_per_row)};
 }

 }  // namespace detail
diff --git a/cpp/src/join/mixed_join_kernel.cu b/cpp/src/join/mixed_join_kernel.cu
index 61cfa168b03..cd4016837cc 100644
--- a/cpp/src/join/mixed_join_kernel.cu
+++ b/cpp/src/join/mixed_join_kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
  */

 #include "mixed_join_kernel.cuh"
+#include "mixed_join_kernel.hpp"

 namespace cudf {
 namespace detail {

-template __global__ void mixed_join<DEFAULT_JOIN_BLOCK_SIZE, false>(
+template void launch_mixed_join<DEFAULT_JOIN_BLOCK_SIZE, false>(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,
@@ -32,7 +33,10 @@ template __global__ void mixed_join<DEFAULT_JOIN_BLOCK_SIZE, false>(
   size_type* join_output_r,
   cudf::ast::detail::expression_device_view device_expression_data,
   cudf::size_type const* join_result_offsets,
-  bool const swap_tables);
+  bool const swap_tables,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream);

 }  // namespace detail
diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh
index ea59f23c77f..9d011d43de6 100644
--- a/cpp/src/join/mixed_join_kernel.cuh
+++ b/cpp/src/join/mixed_join_kernel.cuh
@@ -19,6 +19,7 @@
 #include "join_common_utils.cuh"
 #include "join_common_utils.hpp"
 #include "mixed_join_common_utils.cuh"
+#include "mixed_join_kernel.hpp"

 #include
 #include
@@ -39,20 +40,20 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"

 template <cudf::size_type block_size, bool has_nulls>
-CUDF_HIDDEN __launch_bounds__(block_size) __global__
-  void mixed_join(table_device_view left_table,
-                  table_device_view right_table,
-                  table_device_view probe,
-                  table_device_view build,
-                  row_hash const hash_probe,
-                  row_equality const equality_probe,
-                  join_kind const join_type,
-                  cudf::detail::mixed_multimap_type::device_view hash_table_view,
-                  size_type* join_output_l,
-                  size_type* join_output_r,
-                  cudf::ast::detail::expression_device_view device_expression_data,
-                  cudf::size_type const* join_result_offsets,
-                  bool const swap_tables)
+CUDF_KERNEL void __launch_bounds__(block_size)
+  mixed_join(table_device_view left_table,
+             table_device_view right_table,
+             table_device_view probe,
+             table_device_view build,
+             row_hash const hash_probe,
+             row_equality const equality_probe,
+             join_kind const join_type,
+             cudf::detail::mixed_multimap_type::device_view hash_table_view,
+             size_type* join_output_l,
+             size_type* join_output_r,
+             cudf::ast::detail::expression_device_view device_expression_data,
+             cudf::size_type const* join_result_offsets,
+             bool const swap_tables)
 {
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
@@ -111,6 +112,41 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__
   }
 }

+template <cudf::size_type block_size, bool has_nulls>
+void launch_mixed_join(table_device_view left_table,
+                       table_device_view right_table,
+                       table_device_view probe,
+                       table_device_view build,
+                       row_hash const hash_probe,
+                       row_equality const equality_probe,
+                       join_kind const join_type,
+                       cudf::detail::mixed_multimap_type::device_view hash_table_view,
+                       size_type* join_output_l,
+                       size_type* join_output_r,
+                       cudf::ast::detail::expression_device_view device_expression_data,
+                       cudf::size_type const* join_result_offsets,
+                       bool const swap_tables,
+                       detail::grid_1d const config,
+                       int64_t shmem_size_per_block,
+                       rmm::cuda_stream_view stream)
+{
+  mixed_join<block_size, has_nulls>
+    <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+      left_table,
+      right_table,
+      probe,
+      build,
+      hash_probe,
+      equality_probe,
+      join_type,
+      hash_table_view,
+      join_output_l,
+      join_output_r,
+      device_expression_data,
+      join_result_offsets,
+      swap_tables);
+}
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernel.hpp b/cpp/src/join/mixed_join_kernel.hpp
new file mode 100644
index 00000000000..cc92e9d8ba4
--- /dev/null
+++ b/cpp/src/join/mixed_join_kernel.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "join/join_common_utils.hpp"
+#include "join/mixed_join_common_utils.cuh"
+
+#include
+#include
+#include
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+/**
+ * @brief Performs a join using the combination of a hash lookup to identify
+ * equal rows between one pair of tables and the evaluation of an expression
+ * containing an arbitrary expression.
+ *
+ * This method probes the hash table with each row in the probe table using a
+ * custom equality comparator that also checks that the conditional expression
+ * evaluates to true between the left/right tables when a match is found
+ * between probe and build rows.
+ *
+ * @tparam block_size The number of threads per block for this kernel
+ * @tparam has_nulls Whether or not the inputs may contain nulls.
+ *
+ * @param[in] left_table The left table
+ * @param[in] right_table The right table
+ * @param[in] probe The table with which to probe the hash table for matches.
+ * @param[in] build The table with which the hash table was built.
+ * @param[in] hash_probe The hasher used for the probe table.
+ * @param[in] equality_probe The equality comparator used when probing the hash table.
+ * @param[in] join_type The type of join to be performed
+ * @param[in] hash_table_view The hash table built from `build`.
+ * @param[out] join_output_l The left result of the join operation
+ * @param[out] join_output_r The right result of the join operation
+ * @param[in] device_expression_data Container of device data required to evaluate the desired
+ * expression.
+ * @param[in] join_result_offsets The starting indices in join_output[l|r]
+ * where the matches for each row begin. Equivalent to a prefix sum of
+ * matches_per_row.
+ * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
+ * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
+ */
+template <cudf::size_type block_size, bool has_nulls>
+void launch_mixed_join(table_device_view left_table,
+                       table_device_view right_table,
+                       table_device_view probe,
+                       table_device_view build,
+                       row_hash const hash_probe,
+                       row_equality const equality_probe,
+                       join_kind const join_type,
+                       cudf::detail::mixed_multimap_type::device_view hash_table_view,
+                       size_type* join_output_l,
+                       size_type* join_output_r,
+                       cudf::ast::detail::expression_device_view device_expression_data,
+                       cudf::size_type const* join_result_offsets,
+                       bool const swap_tables,
+                       detail::grid_1d const config,
+                       int64_t shmem_size_per_block,
+                       rmm::cuda_stream_view stream);
+
+}  // namespace detail
+
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/join/mixed_join_kernel_nulls.cu b/cpp/src/join/mixed_join_kernel_nulls.cu
index 518f8ed8555..185aa133f2d 100644
--- a/cpp/src/join/mixed_join_kernel_nulls.cu
+++ b/cpp/src/join/mixed_join_kernel_nulls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ */ #include "mixed_join_kernel.cuh" +#include "mixed_join_kernel.hpp" namespace cudf { namespace detail { -template __global__ void mixed_join( +template void launch_mixed_join( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -32,7 +33,10 @@ template __global__ void mixed_join( size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, cudf::size_type const* join_result_offsets, - bool const swap_tables); + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh deleted file mode 100644 index 037c02666d4..00000000000 --- a/cpp/src/join/mixed_join_kernels.cuh +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" - -#include -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Computes the output size of joining the left table to the right table. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - * @param[out] output_size The resulting output size - * @param[out] matches_per_row The number of matches in one pair of - * equality/conditional tables for each row in the other pair of tables. If - * swap_tables is true, matches_per_row corresponds to the right_table, - * otherwise it corresponds to the left_table. 
Note that corresponding swap of - * left/right tables to determine which is the build table and which is the - * probe table has already happened on the host. - */ - -template -__global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -/** - * @brief Performs a join using the combination of a hash lookup to identify - * equal rows between one pair of tables and the evaluation of an expression - * containing an arbitrary expression. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[out] join_output_l The left result of the join operation - * @param[out] join_output_r The right result of the join operation - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] join_result_offsets The starting indices in join_output[l|r] - * where the matches for each row begin. Equivalent to a prefix sum of - * matches_per_row. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - */ -template -__global__ void mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); - -} // namespace detail - -} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 1f31eaa7878..7459ac3e99c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -14,9 +14,7 @@ * limitations under the License. 
 */

-#include "join/join_common_utils.cuh"
-#include "join/join_common_utils.hpp"
-#include "join/mixed_join_common_utils.cuh"
+#include "join/mixed_join_kernels_semi.cuh"

 #include
 #include
@@ -35,16 +33,16 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"

 template <cudf::size_type block_size, bool has_nulls>
-CUDF_HIDDEN __launch_bounds__(block_size) __global__
-  void mixed_join_semi(table_device_view left_table,
-                       table_device_view right_table,
-                       table_device_view probe,
-                       table_device_view build,
-                       row_hash const hash_probe,
-                       row_equality const equality_probe,
-                       cudf::detail::semi_map_type::device_view hash_table_view,
-                       cudf::device_span<bool> left_table_keep_mask,
-                       cudf::ast::detail::expression_device_view device_expression_data)
+CUDF_KERNEL void __launch_bounds__(block_size)
+  mixed_join_semi(table_device_view left_table,
+                  table_device_view right_table,
+                  table_device_view probe,
+                  table_device_view build,
+                  row_hash const hash_probe,
+                  row_equality const equality_probe,
+                  cudf::detail::semi_map_type::device_view hash_table_view,
+                  cudf::device_span<bool> left_table_keep_mask,
+                  cudf::ast::detail::expression_device_view device_expression_data)
 {
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
@@ -75,28 +73,46 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__
   }
 }

-template __global__ void mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  cudf::detail::semi_map_type::device_view hash_table_view,
-  cudf::device_span<bool> left_table_keep_mask,
-  cudf::ast::detail::expression_device_view device_expression_data);
-
-template __global__ void mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  cudf::detail::semi_map_type::device_view hash_table_view,
-  cudf::device_span<bool> left_table_keep_mask,
-  cudf::ast::detail::expression_device_view device_expression_data);
+void launch_mixed_join_semi(bool has_nulls,
+                            table_device_view left_table,
+                            table_device_view right_table,
+                            table_device_view probe,
+                            table_device_view build,
+                            row_hash const hash_probe,
+                            row_equality const equality_probe,
+                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            cudf::device_span<bool> left_table_keep_mask,
+                            cudf::ast::detail::expression_device_view device_expression_data,
+                            detail::grid_1d const config,
+                            int64_t shmem_size_per_block,
+                            rmm::cuda_stream_view stream)
+{
+  if (has_nulls) {
+    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        left_table,
+        right_table,
+        probe,
+        build,
+        hash_probe,
+        equality_probe,
+        hash_table_view,
+        left_table_keep_mask,
+        device_expression_data);
+  } else {
+    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        left_table,
+        right_table,
+        probe,
+        build,
+        hash_probe,
+        equality_probe,
+        hash_table_view,
+        left_table_keep_mask,
+        device_expression_data);
+  }
+}

 }  // namespace detail
-
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh
index 4ea404d451c..43714ffb36a 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cuh
+++ b/cpp/src/join/mixed_join_kernels_semi.cuh
@@ -16,8 +16,9 @@

 #pragma once

-#include "join/join_common_utils.hpp"
-#include "join/mixed_join_common_utils.cuh"
+#include "join_common_utils.cuh"
+#include "join_common_utils.hpp"
+#include "mixed_join_common_utils.cuh"

 #include
 #include
@@ -39,6 +40,7 @@ namespace detail {
  * @tparam block_size The number of threads per block for this kernel
  * @tparam has_nulls Whether or not the inputs may contain nulls.
  *
+ * @param[in] has_nulls If the input has nulls
  * @param[in] left_table The left table
  * @param[in] right_table The right table
  * @param[in] probe The table with which to probe the hash table for matches.
@@ -51,16 +53,19 @@ namespace detail {
  * @param[in] device_expression_data Container of device data required to evaluate the desired
  * expression.
  */
-template <cudf::size_type block_size, bool has_nulls>
-__global__ void mixed_join_semi(table_device_view left_table,
-                                table_device_view right_table,
-                                table_device_view probe,
-                                table_device_view build,
-                                row_hash const hash_probe,
-                                row_equality const equality_probe,
-                                cudf::detail::semi_map_type::device_view hash_table_view,
-                                cudf::device_span<bool> left_table_keep_mask,
-                                cudf::ast::detail::expression_device_view device_expression_data);
+void launch_mixed_join_semi(bool has_nulls,
+                            table_device_view left_table,
+                            table_device_view right_table,
+                            table_device_view probe,
+                            table_device_view build,
+                            row_hash const hash_probe,
+                            row_equality const equality_probe,
+                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            cudf::device_span<bool> left_table_keep_mask,
+                            cudf::ast::detail::expression_device_view device_expression_data,
+                            detail::grid_1d const config,
+                            int64_t shmem_size_per_block,
+                            rmm::cuda_stream_view stream);

 }  // namespace detail
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index 3e4188a0fbd..a79aa6673d6 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -227,31 +227,19 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   // Vector used to indicate indices from left/probe table which are present in output
   auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);

-  if (has_nulls) {
-    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        hash_table_view,
-        cudf::device_span<bool>(left_table_keep_mask),
-        parser.device_expression_data);
-  } else {
-    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        hash_table_view,
-        cudf::device_span<bool>(left_table_keep_mask),
-        parser.device_expression_data);
-  }
+  launch_mixed_join_semi(has_nulls,
+                         *left_conditional_view,
+                         *right_conditional_view,
+                         *probe_view,
+                         *build_view,
+                         hash_probe,
+                         equality_probe,
+                         hash_table_view,
+                         cudf::device_span<bool>(left_table_keep_mask),
+                         parser.device_expression_data,
+                         config,
+                         shmem_size_per_block,
+                         stream);

   auto gather_map =
     std::make_unique<rmm::device_uvector<size_type>>(probe.num_rows(), stream, mr);
diff --git a/cpp/src/join/mixed_join_size_kernel.cu b/cpp/src/join/mixed_join_size_kernel.cu
index 4011acb65d6..4882c8769e6 100644
--- a/cpp/src/join/mixed_join_size_kernel.cu
+++ b/cpp/src/join/mixed_join_size_kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
  */

 #include "mixed_join_size_kernel.cuh"
+#include "mixed_join_size_kernel.hpp"

 namespace cudf {
 namespace detail {

-template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>(
+template std::size_t launch_compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,
@@ -30,8 +31,11 @@ template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE
   cudf::detail::mixed_multimap_type::device_view hash_table_view,
   ast::detail::expression_device_view device_expression_data,
   bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<size_type> matches_per_row);
+  cudf::device_span<size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);

 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh
index 00a90f8273f..a1066e32331 100644
--- a/cpp/src/join/mixed_join_size_kernel.cuh
+++ b/cpp/src/join/mixed_join_size_kernel.cuh
@@ -36,19 +36,19 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"

 template <cudf::size_type block_size, bool has_nulls>
-CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  join_kind const join_type,
-  cudf::detail::mixed_multimap_type::device_view hash_table_view,
-  ast::detail::expression_device_view device_expression_data,
-  bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<size_type> matches_per_row)
+CUDF_KERNEL void __launch_bounds__(block_size)
+  compute_mixed_join_output_size(table_device_view left_table,
+                                 table_device_view right_table,
+                                 table_device_view probe,
+                                 table_device_view build,
+                                 row_hash const hash_probe,
+                                 row_equality const equality_probe,
+                                 join_kind const join_type,
+                                 cudf::detail::mixed_multimap_type::device_view hash_table_view,
+                                 ast::detail::expression_device_view device_expression_data,
+                                 bool const swap_tables,
+                                 std::size_t* output_size,
+                                 cudf::device_span<size_type> matches_per_row)
 {
   // The (required) extern storage of the shared memory array leads to
   // conflicting declarations between different templates. The easiest
@@ -103,5 +103,43 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_out
   }
 }

+template <cudf::size_type block_size, bool has_nulls>
+std::size_t launch_compute_mixed_join_output_size(
+  table_device_view left_table,
+  table_device_view right_table,
+  table_device_view probe,
+  table_device_view build,
+  row_hash const hash_probe,
+  row_equality const equality_probe,
+  join_kind const join_type,
+  cudf::detail::mixed_multimap_type::device_view hash_table_view,
+  ast::detail::expression_device_view device_expression_data,
+  bool const swap_tables,
+  cudf::device_span<size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  // Allocate storage for the counter used to get the size of the join output
+  rmm::device_scalar<std::size_t> size(0, stream, mr);
+
+  compute_mixed_join_output_size<block_size, has_nulls>
+    <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+      left_table,
+      right_table,
+      probe,
+      build,
+      hash_probe,
+      equality_probe,
+      join_type,
+      hash_table_view,
+      device_expression_data,
+      swap_tables,
+      size.data(),
+      matches_per_row);
+  return size.value(stream);
+}
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_size_kernel.hpp b/cpp/src/join/mixed_join_size_kernel.hpp
new file mode 100644
index 00000000000..b09805c14dc
--- /dev/null
+++ b/cpp/src/join/mixed_join_size_kernel.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "join_common_utils.cuh"
+#include "join_common_utils.hpp"
+#include "mixed_join_common_utils.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+/**
+ * @brief Computes the output size of joining the left table to the right table.
+ *
+ * This method probes the hash table with each row in the probe table using a
+ * custom equality comparator that also checks that the conditional expression
+ * evaluates to true between the left/right tables when a match is found
+ * between probe and build rows.
+ *
+ * @tparam block_size The number of threads per block for this kernel
+ * @tparam has_nulls Whether or not the inputs may contain nulls.
+ *
+ * @param[in] left_table The left table
+ * @param[in] right_table The right table
+ * @param[in] probe The table with which to probe the hash table for matches.
+ * @param[in] build The table with which the hash table was built.
+ * @param[in] hash_probe The hasher used for the probe table.
+ * @param[in] equality_probe The equality comparator used when probing the hash table.
+ * @param[in] join_type The type of join to be performed
+ * @param[in] hash_table_view The hash table built from `build`.
+ * @param[in] device_expression_data Container of device data required to evaluate the desired
+ * expression.
+ * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
+ * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
+ * @param[out] output_size The resulting output size
+ * @param[out] matches_per_row The number of matches in one pair of
+ * equality/conditional tables for each row in the other pair of tables. If
+ * swap_tables is true, matches_per_row corresponds to the right_table,
+ * otherwise it corresponds to the left_table. Note that corresponding swap of
+ * left/right tables to determine which is the build table and which is the
+ * probe table has already happened on the host.
+ */
+
+template <cudf::size_type block_size, bool has_nulls>
+std::size_t launch_compute_mixed_join_output_size(
+  cudf::table_device_view left_table,
+  cudf::table_device_view right_table,
+  cudf::table_device_view probe,
+  cudf::table_device_view build,
+  row_hash const hash_probe,
+  row_equality const equality_probe,
+  join_kind const join_type,
+  cudf::detail::mixed_multimap_type::device_view hash_table_view,
+  ast::detail::expression_device_view device_expression_data,
+  bool const swap_tables,
+  cudf::device_span<size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/join/mixed_join_size_kernel_nulls.cu b/cpp/src/join/mixed_join_size_kernel_nulls.cu
index 2868113bf33..11f9103da4d 100644
--- a/cpp/src/join/mixed_join_size_kernel_nulls.cu
+++ b/cpp/src/join/mixed_join_size_kernel_nulls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 namespace cudf {
 namespace detail {

-template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>(
+template std::size_t launch_compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,
@@ -30,8 +30,10 @@ template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE
   cudf::detail::mixed_multimap_type::device_view hash_table_view,
   ast::detail::expression_device_view device_expression_data,
   bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<size_type> matches_per_row);
-
+  cudf::device_span<size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 }  // namespace detail
 }  // namespace cudf

From a2503913bb362e43fa77615748ed4b4e31ac5055 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Mon, 26 Aug 2024 09:26:32 -0700
Subject: [PATCH 119/270] Revise `get_reader_filepath_or_buffer` to handle a
 list of data sources (#16613)

The cudf read APIs (e.g. `cudf.read_parquet`, `cudf.read_json`, etc...)
currently iterate over data sources, calling `get_reader_filepath_or_buffer`
on each source independently when multiple files are mapped to the same
`cudf.DataFrame`. This is suboptimal when the data sources are remote-file
paths (e.g. in S3). In this case, we **should** be initiating network
transfers for all files in parallel (and as early as possible). This PR
makes it easier to optimize multi-file data transfer in follow-up work. It
also simplifies and centralizes some of the common logic used by the
various read APIs.
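As a rough illustration of the new contract (a minimal sketch only: the
`s3://bucket/...` paths are hypothetical, while the helper names and keyword
arguments match the diff below):

```python
from cudf.utils import ioutils

# One call now resolves a *list* of sources to a list of paths/buffers,
# expanding any directory entries via the optional glob pattern.
sources = ioutils.get_reader_filepath_or_buffer(
    path_or_data=["s3://bucket/a.json", "s3://bucket/dir"],  # hypothetical paths
    storage_options=None,
    expand_dir_pattern="*.json",
)

# Readers that only support a single source opt in explicitly and get a
# clear error message for multi-source input.
single = ioutils._select_single_source(sources, "read_avro")
```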
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16613 --- python/cudf/cudf/io/avro.py | 17 +-- python/cudf/cudf/io/csv.py | 15 +-- python/cudf/cudf/io/json.py | 58 ++------- python/cudf/cudf/io/orc.py | 41 ++---- python/cudf/cudf/io/parquet.py | 59 ++------- python/cudf/cudf/io/text.py | 6 +- python/cudf/cudf/utils/ioutils.py | 210 ++++++++++++++++-------------- 7 files changed, 161 insertions(+), 245 deletions(-) diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 728b34045bf..964bd02b03e 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import cudf from cudf import _lib as libcudf @@ -15,22 +15,13 @@ def read_avro( ): """{docstring}""" - is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer( + filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( path_or_data=filepath_or_buffer, storage_options=storage_options, ) - if not is_single_filepath_or_buffer: - raise NotImplementedError( - "`read_avro` does not yet support reading multiple files" - ) - - filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - compression=None, - storage_options=storage_options, + filepath_or_buffer = ioutils._select_single_source( + filepath_or_buffer, "read_avro" ) - if compression is not None: - ValueError("URL content-encoding decompression is not supported") return cudf.DataFrame._from_data( *libcudf.avro.read_avro( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e61fc5063dc..a9c20150930 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -64,22 +64,15 @@ def read_csv( if bytes_per_thread is None: bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT - is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer( + filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( path_or_data=filepath_or_buffer, - storage_options=storage_options, - ) - if not is_single_filepath_or_buffer: - raise NotImplementedError( - "`read_csv` does not yet support reading multiple files" - ) - - filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - compression=compression, iotypes=(BytesIO, StringIO), storage_options=storage_options, bytes_per_thread=bytes_per_thread, ) + filepath_or_buffer = ioutils._select_single_source( + filepath_or_buffer, "read_csv" + ) if na_values is not None and is_scalar(na_values): na_values = [na_values] diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index fc3387d5117..d86db656fd0 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -9,7 +9,6 @@ import cudf from cudf._lib import json as libjson -from cudf.api.types import is_list_like from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type @@ -62,37 +61,15 @@ def read_json( f"following positional arguments: {list(args)}" ) - # Multiple sources are passed as a list. If a single source is passed, - # wrap it in a list for unified processing downstream. 
- if not is_list_like(path_or_buf): - path_or_buf = [path_or_buf] - - filepaths_or_buffers = [] - for source in path_or_buf: - if ioutils.is_directory( - path_or_data=source, storage_options=storage_options - ): - fs = ioutils._ensure_filesystem( - passed_filesystem=None, - path=source, - storage_options=storage_options, - ) - source = ioutils.stringify_pathlike(source) - source = fs.sep.join([source, "*.json"]) - - tmp_source, compression = ioutils.get_reader_filepath_or_buffer( - path_or_data=source, - compression=compression, - iotypes=(BytesIO, StringIO), - allow_raw_text_input=True, - storage_options=storage_options, - warn_on_raw_text_input=True, - warn_meta=("json", "read_json"), - ) - if isinstance(tmp_source, list): - filepaths_or_buffers.extend(tmp_source) - else: - filepaths_or_buffers.append(tmp_source) + filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( + path_or_buf, + iotypes=(BytesIO, StringIO), + allow_raw_text_input=True, + storage_options=storage_options, + warn_on_raw_text_input=True, + warn_meta=("json", "read_json"), + expand_dir_pattern="*.json", + ) df = libjson.read_json( filepaths_or_buffers=filepaths_or_buffers, @@ -111,25 +88,18 @@ def read_json( "be GPU accelerated in the future" ) - if not ioutils.ensure_single_filepath_or_buffer( - path_or_data=path_or_buf, - storage_options=storage_options, - ): - raise NotImplementedError( - "`read_json` does not yet support reading " - "multiple files via pandas" - ) - - path_or_buf, compression = ioutils.get_reader_filepath_or_buffer( + filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( path_or_data=path_or_buf, - compression=compression, iotypes=(BytesIO, StringIO), allow_raw_text_input=True, storage_options=storage_options, ) + filepath_or_buffer = ioutils._select_single_source( + filepath_or_buffer, "read_json (via pandas)" + ) pd_value = pd.read_json( - path_or_buf, + filepath_or_buffer, lines=lines, dtype=dtype, compression=compression, diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 4f04caafc5d..fd246c6215f 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -4,7 +4,6 @@ import warnings import pyarrow as pa -from fsspec.utils import stringify_path import cudf from cudf._lib import orc as liborc @@ -170,8 +169,11 @@ def read_orc_statistics( files_statistics = [] stripes_statistics = [] for source in filepaths_or_buffers: - path_or_buf, _ = ioutils.get_reader_filepath_or_buffer( - path_or_data=source, compression=None, **kwargs + path_or_buf = ioutils.get_reader_filepath_or_buffer( + path_or_data=source, **kwargs + ) + path_or_buf = ioutils._select_single_source( + path_or_buf, "read_orc_statistics" ) ( column_names, @@ -318,33 +320,12 @@ def read_orc( "A list of stripes must be provided for each input source" ) - filepaths_or_buffers = [] - for source in filepath_or_buffer: - if ioutils.is_directory( - path_or_data=source, storage_options=storage_options - ): - fs = ioutils._ensure_filesystem( - passed_filesystem=None, - path=source, - storage_options=storage_options, - ) - source = stringify_path(source) - source = fs.sep.join([source, "*.orc"]) - - tmp_source, compression = ioutils.get_reader_filepath_or_buffer( - path_or_data=source, - compression=None, - storage_options=storage_options, - bytes_per_thread=bytes_per_thread, - ) - if compression is not None: - raise ValueError( - "URL content-encoding decompression is not supported" - ) - if isinstance(tmp_source, list): - filepaths_or_buffers.extend(tmp_source) - else: - 
filepaths_or_buffers.append(tmp_source) + filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( + path_or_data=filepath_or_buffer, + storage_options=storage_options, + bytes_per_thread=bytes_per_thread, + expand_dir_pattern="*.orc", + ) if filters is not None: selected_stripes = _filter_stripes( diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 560f257c115..6b895abbf66 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -329,39 +329,12 @@ def write_to_dataset( @_performance_tracking def read_parquet_metadata(filepath_or_buffer): """{docstring}""" - # Multiple sources are passed as a list. If a single source is passed, - # wrap it in a list for unified processing downstream. - if not is_list_like(filepath_or_buffer): - filepath_or_buffer = [filepath_or_buffer] - - # Start by trying to construct a filesystem object - fs, paths = ioutils._get_filesystem_and_paths( - path_or_data=filepath_or_buffer, storage_options=None - ) - - # Check if filepath or buffer - filepath_or_buffer = paths if paths else filepath_or_buffer # List of filepaths or buffers - filepaths_or_buffers = [] - - for source in filepath_or_buffer: - tmp_source, compression = ioutils.get_reader_filepath_or_buffer( - path_or_data=source, - compression=None, - fs=fs, - storage_options=None, - bytes_per_thread=None, - ) - - if compression is not None: - raise ValueError( - "URL content-encoding decompression is not supported" - ) - if isinstance(tmp_source, list): - filepath_or_buffer.extend(tmp_source) - else: - filepaths_or_buffers.append(tmp_source) + filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( + path_or_data=filepath_or_buffer, + bytes_per_thread=None, + ) return libparquet.read_parquet_metadata(filepaths_or_buffers) @@ -598,24 +571,12 @@ def read_parquet( ) filepath_or_buffer = paths if paths else filepath_or_buffer - filepaths_or_buffers = [] - for source in filepath_or_buffer: - tmp_source, compression = ioutils.get_reader_filepath_or_buffer( - path_or_data=source, - compression=None, - fs=fs, - storage_options=storage_options, - bytes_per_thread=bytes_per_thread, - ) - - if compression is not None: - raise ValueError( - "URL content-encoding decompression is not supported" - ) - if isinstance(tmp_source, list): - filepath_or_buffer.extend(tmp_source) - else: - filepaths_or_buffers.append(tmp_source) + filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( + path_or_data=filepath_or_buffer, + fs=fs, + storage_options=storage_options, + bytes_per_thread=bytes_per_thread, + ) # Warn user if they are not using cudf for IO # (There is a good chance this was not the intention) diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 4329480bb2c..0043efce1e4 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -24,12 +24,14 @@ def read_text( if delimiter is None: raise ValueError("delimiter needs to be provided") - filepath_or_buffer, _ = ioutils.get_reader_filepath_or_buffer( + filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( path_or_data=filepath_or_buffer, - compression=None, iotypes=(BytesIO, StringIO), storage_options=storage_options, ) + filepath_or_buffer = ioutils._select_single_source( + filepath_or_buffer, "read_text" + ) return cudf.Series._from_data( libtext.read_text( diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 18106e7475b..e5944d7093c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py 
@@ -14,6 +14,7 @@ import pandas as pd from fsspec.core import expand_paths_if_needed, get_fs_token_paths +from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -799,7 +800,7 @@ k1 k2 0 1.0 [1] """ # noqa: E501 -doc_read_json = docfmt_partial(docstring=_docstring_read_json) +doc_read_json: Callable = docfmt_partial(docstring=_docstring_read_json) _docstring_to_json = """ Convert the cuDF object to a JSON string. @@ -869,7 +870,7 @@ -------- cudf.read_json """ -doc_to_json = docfmt_partial(docstring=_docstring_to_json) +doc_to_json: Callable = docfmt_partial(docstring=_docstring_to_json) _docstring_read_hdf = """ Read from the store, close it if we opened it. @@ -1399,13 +1400,14 @@ Return either a filepath string to data, or a memory buffer of data. If filepath, then the source filepath is expanded to user's environment. If buffer, then data is returned in-memory as bytes or a ByteIO object. +This function is designed to process multiple data sources of the same +type at once. If path_or_data is a list, the output will also be a list. Parameters ---------- -path_or_data : str, file-like object, bytes, ByteIO - Path to data or the data itself. -compression : str - Type of compression algorithm for the content +path_or_data : str, file-like object, bytes, ByteIO, list + Path to data or the data itself. Pass in a list to process multiple + sources of the same type at once. mode : str Mode in which file is opened iotypes : (), default (BytesIO) @@ -1430,14 +1432,15 @@ better throughput by decomposing it and transferring multiple "blocks" in parallel (using a Python thread pool). Default allocation is {bytes_per_thread} bytes. +expand_dir_pattern : str, default None + Glob pattern to use when expanding directories into file paths + (e.g. "*.json"). If this parameter is not specified, directories + will not be expanded. Returns ------- -filepath_or_buffer : str, bytes, BytesIO, list - Filepath string or in-memory buffer of data or a - list of Filepath strings or in-memory buffers of data. -compression : str - Type of compression algorithm for the content +List[str, bytes, BytesIO] + List of filepath strings or in-memory data buffers. """.format(bytes_per_thread=_BYTES_PER_THREAD_DEFAULT) @@ -1494,29 +1497,15 @@ def _is_local_filesystem(fs): return isinstance(fs, fsspec.implementations.local.LocalFileSystem) -def ensure_single_filepath_or_buffer(path_or_data, storage_options=None): - """Return False if `path_or_data` resolves to multiple filepaths or - buffers. +def _select_single_source(sources: list, caller: str): + """Select the first element from a list of sources. 
+    Raise an error if sources contains multiple elements
+    """
-    path_or_data = stringify_pathlike(path_or_data)
-    if isinstance(path_or_data, str):
-        path_or_data = os.path.expanduser(path_or_data)
-        try:
-            fs, _, paths = get_fs_token_paths(
-                path_or_data, mode="rb", storage_options=storage_options
-            )
-        except ValueError as e:
-            if str(e).startswith("Protocol not known"):
-                return True
-            else:
-                raise e
-
-        if len(paths) > 1:
-            return False
-    elif isinstance(path_or_data, (list, tuple)) and len(path_or_data) > 1:
-        return False
-
-    return True
+    if len(sources) > 1:
+        raise ValueError(
+            f"{caller} does not support multiple sources, got: {sources}"
+        )
+    return sources[0]


 def is_directory(path_or_data, storage_options=None):
@@ -1601,10 +1590,24 @@ def _get_filesystem_and_paths(
     return fs, return_paths


+def _maybe_expand_directories(paths, glob_pattern, fs):
+    # Expand directory paths using a glob pattern.
+    # This is a no-op if either glob_pattern or fs are None
+    if fs is None or glob_pattern is None:
+        return paths
+    expanded_paths = []
+    for path in paths:
+        if fs.isdir(path):
+            expanded_paths.extend(fs.glob(fs.sep.join([path, glob_pattern])))
+        else:
+            expanded_paths.append(path)
+    return expanded_paths
+
+
 @doc_get_reader_filepath_or_buffer()
 def get_reader_filepath_or_buffer(
     path_or_data,
-    compression,
+    *,
     mode="rb",
     fs=None,
     iotypes=(BytesIO,),
@@ -1613,32 +1616,38 @@ def get_reader_filepath_or_buffer(
     bytes_per_thread=_BYTES_PER_THREAD_DEFAULT,
     warn_on_raw_text_input=None,
     warn_meta=None,
+    expand_dir_pattern=None,
 ):
     """{docstring}"""
-    path_or_data = stringify_pathlike(path_or_data)
-
-    if isinstance(path_or_data, str):
-        # Get a filesystem object if one isn't already available
-        paths = [path_or_data]
+    # Convert path_or_data to a list of input data sources
+    input_sources = [
+        stringify_pathlike(source)
+        for source in (
+            path_or_data if is_list_like(path_or_data) else [path_or_data]
+        )
+    ]
+    if not input_sources:
+        raise ValueError(f"Empty input source list: {input_sources}.")
+
+    filepaths_or_buffers = []
+    string_paths = [isinstance(source, str) for source in input_sources]
+    if any(string_paths):
+        # Sources are all strings. These strings are typically
+        # file paths, but they may also be raw text strings.
+
+        # Don't allow a mix of source types
+        if not all(string_paths):
+            raise ValueError(f"Invalid input source list: {input_sources}.")
+
+        # Make sure we define a filesystem (if possible)
+        paths = input_sources
+        raw_text_input = False
         if fs is None:
-            fs, paths = _get_filesystem_and_paths(
-                path_or_data, storage_options
-            )
-            if fs is None:
-                if warn_on_raw_text_input:
-                    # Do not remove until pandas 3.0 support is added.
-                    assert (
-                        PANDAS_LT_300
-                    ), "Need to drop after pandas-3.0 support is added."
-                    warnings.warn(
-                        f"Passing literal {warn_meta[0]} to {warn_meta[1]} is "
-                        "deprecated and will be removed in a future version. "
-                        "To read from a literal string, wrap it in a "
-                        "'StringIO' object.",
-                        FutureWarning,
-                    )
-                return path_or_data, compression
+            fs, paths = _get_filesystem_and_paths(paths, storage_options)
+
+        # Expand directories (if necessary)
+        paths = _maybe_expand_directories(paths, expand_dir_pattern, fs)

         if _is_local_filesystem(fs):
             # Doing this as `read_json` accepts a json string
@@ -1660,7 +1669,7 @@ def get_reader_filepath_or_buffer(

             if len(paths):
                 if fs.exists(paths[0]):
-                    path_or_data = paths if len(paths) > 1 else paths[0]
+                    filepaths_or_buffers = paths

                 # raise FileNotFound if path looks like json
                 # following pandas
@@ -1670,21 +1679,40 @@ def get_reader_filepath_or_buffer(
                     tuple(f".json{c}" for c in compression_extensions)
                 ):
                     raise FileNotFoundError(
-                        f"{path_or_data} could not be resolved to any files"
+                        f"{input_sources} could not be resolved to any files"
                     )
-            elif warn_on_raw_text_input:
-                # Do not remove until pandas 3.0 support is added.
-                assert (
-                    PANDAS_LT_300
-                ), "Need to drop after pandas-3.0 support is added."
-                warnings.warn(
-                    f"Passing literal {warn_meta[0]} to {warn_meta[1]} is "
-                    "deprecated and will be removed in a future version. "
-                    "To read from a literal string, wrap it in a "
-                    "'StringIO' object.",
-                    FutureWarning,
+                else:
+                    raw_text_input = True
+            else:
+                raw_text_input = True
+
+        elif fs is not None:
+            # TODO: We can use cat_ranges and/or parquet-aware logic
+            # to copy all remote data into host memory at once here.
+            # The current solution iterates over files, and copies
+            # ALL data from each file (even when we are performing
+            # partial IO, and don't need the entire file)
+            if len(paths) == 0:
+                raise FileNotFoundError(
+                    f"{input_sources} could not be resolved to any files"
+                )
+            filepaths_or_buffers = [
+                BytesIO(
+                    _fsspec_data_transfer(
+                        fpath,
+                        fs=fs,
+                        mode=mode,
+                        bytes_per_thread=bytes_per_thread,
+                    )
                 )
-            elif warn_on_raw_text_input:
+                for fpath in paths
+            ]
+        else:
+            raw_text_input = True
+
+    if raw_text_input:
+        filepaths_or_buffers = input_sources
+        if warn_on_raw_text_input:
             # Do not remove until pandas 3.0 support is added.
assert ( PANDAS_LT_300 @@ -1697,35 +1725,25 @@ def get_reader_filepath_or_buffer( FutureWarning, ) - else: - if len(paths) == 0: - raise FileNotFoundError( - f"{path_or_data} could not be resolved to any files" - ) - path_or_data = [ - BytesIO( - _fsspec_data_transfer( - fpath, - fs=fs, - mode=mode, - bytes_per_thread=bytes_per_thread, + else: + # Sources are already buffers or file-like objects + for source in input_sources: + if not isinstance(source, iotypes) and is_file_like(source): + if isinstance(source, TextIOWrapper): + source = source.buffer + filepaths_or_buffers.append( + BytesIO( + _fsspec_data_transfer( + source, + mode=mode, + bytes_per_thread=bytes_per_thread, + ) ) ) - for fpath in paths - ] - if len(path_or_data) == 1: - path_or_data = path_or_data[0] - - elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data): - if isinstance(path_or_data, TextIOWrapper): - path_or_data = path_or_data.buffer - path_or_data = BytesIO( - _fsspec_data_transfer( - path_or_data, mode=mode, bytes_per_thread=bytes_per_thread - ) - ) + else: + filepaths_or_buffers.append(source) - return path_or_data, compression + return filepaths_or_buffers def get_writer_filepath_or_buffer(path_or_data, mode, storage_options=None): From d15d470e526de205bed8808a9c15d0a4d7642667 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 26 Aug 2024 12:03:41 -0500 Subject: [PATCH 120/270] Preserve Series name in duplicated method. (#16655) Closes #16654. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16655 --- python/cudf/cudf/core/indexed_frame.py | 4 +++- python/cudf/cudf/tests/test_series.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 60253b9ae5d..ad6aa56d472 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3198,8 +3198,10 @@ def duplicated(self, subset=None, keep="first"): """ subset = self._preprocess_subset(subset) + name = None if isinstance(self, cudf.Series): columns = [self._column] + name = self.name else: columns = [self._data[n] for n in subset] distinct = libcudf.stream_compaction.distinct_indices( @@ -3211,7 +3213,7 @@ def duplicated(self, subset=None, keep="first"): [as_column(True, length=len(self), dtype=bool)], bounds_check=False, )[0] - return cudf.Series._from_column(result, index=self.index) + return cudf.Series._from_column(result, index=self.index, name=name) @_performance_tracking def _empty_like(self, keep_index=True) -> Self: diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index c7aea563535..8d673e23ab2 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2115,8 +2115,9 @@ def test_series_hasnans(data): ], ) @pytest.mark.parametrize("keep", ["first", "last", False]) -def test_series_duplicated(data, index, keep): - gs = cudf.Series(data, index=index) +@pytest.mark.parametrize("name", [None, "a"]) +def test_series_duplicated(data, index, keep, name): + gs = cudf.Series(data, index=index, name=name) ps = gs.to_pandas() assert_eq(gs.duplicated(keep=keep), ps.duplicated(keep=keep)) From f5113228c3aa89d49e71d42d11c38afe52695aa6 Mon Sep 17 00:00:00 2001 From: "Marcus D. 
Hanwell" Date: Mon, 26 Aug 2024 16:19:30 -0400 Subject: [PATCH 121/270] bug-fix: Don't enable the CUDA language if testing was requested when finding cudf (#16615) This PR removes CMake code enabling the CUDA language if the testing component was requested. Closes #16614 Authors: - Marcus D. Hanwell (https://github.com/cryos) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16615 --- cpp/CMakeLists.txt | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6b8bb26825b..a6f72ed6b75 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1069,23 +1069,12 @@ if(CUDF_ENABLE_ARROW_PARQUET) ) endif() -string( - APPEND - install_code_string - [=[ -if(testing IN_LIST cudf_FIND_COMPONENTS) - enable_language(CUDA) -endif() -]=] -) - rapids_export( INSTALL cudf EXPORT_SET cudf-exports ${_components_export_string} GLOBAL_TARGETS cudf cudftestutil NAMESPACE cudf:: DOCUMENTATION doc_string - FINAL_CODE_BLOCK install_code_string ) # ################################################################################################## From c4591c06db5347ea2bf6e37ead678343042a7932 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:23:43 -0400 Subject: [PATCH 122/270] Use non-mangled type names in nvbench output (#16649) Uses the `NVBENCH_DECLARE_TYPE_STRINGS` feature to produce readable type names in the nvbench output. Example previous output for `cudf::timestamp_ms` would appear like this: ``` | cuda::std::__4::chrono::time_point > > | 100000 | 23840x | 25.138 us | 21.98% | 20.979 us | 9.54% | 4.767G | 38.134 GB/s | 4.38% | ``` Adding the nvbench name feature changes this to: ``` | cudf::timestamp_ms | 100000 | 24752x | 24.387 us | 21.58% | 20.208 us | 3.79% | 4.948G | 39.588 GB/s | 4.55% | ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/16649 --- cpp/benchmarks/reduction/minmax.cpp | 2 ++ cpp/benchmarks/reduction/reduce.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index c89e22d3f44..636de303cc4 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -47,6 +47,8 @@ static void reduction_minmax(nvbench::state& state, nvbench::type_list set_throughputs(state); } +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + using Types = nvbench::type_list; NVBENCH_BENCH_TYPES(reduction_minmax, NVBENCH_TYPE_AXES(Types)) diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 14bf90c4943..a30c27c519c 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -81,6 +81,8 @@ static void reduction(nvbench::state& state, nvbench::type_list; using AggKinds = nvbench::enum_type_list Date: Tue, 27 Aug 2024 09:34:51 -0400 Subject: [PATCH 123/270] Fix integer overflow in indexalator pointer logic (#16643) Fixes integer overflow in the indexalator logic when incrementing/decrementing its data pointer. Any sufficiently large int32 input values used in computing the byte-pointer position causes an overflow when multiplying the value by the byte-width of the underlying index type. 
For example, this overflow would occur when accessing rows greater than
536,870,912 with an underlying index type of int32 (4-bytes).

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: https://github.com/rapidsai/cudf/pull/16643
---
 cpp/include/cudf/detail/indexalator.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index ec7b1c3e6b6..f0510c86c3a 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -93,7 +93,7 @@ struct input_indexalator : base_normalator<input_indexalator, cudf::size_type>
    */
   __device__ inline cudf::size_type operator[](size_type idx) const
   {
-    void const* tp = p_ + (idx * this->width_);
+    void const* tp = p_ + (static_cast<std::ptrdiff_t>(idx) * this->width_);
     return type_dispatcher(this->dtype_, normalize_type{}, tp);
   }

@@ -109,7 +109,7 @@ struct input_indexalator : base_normalator<input_indexalator, cudf::size_type>
   CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0)
     : base_normalator<input_indexalator, cudf::size_type>(dtype), p_{static_cast<char const*>(data)}
   {
-    p_ += offset * this->width_;
+    p_ += static_cast<std::ptrdiff_t>(offset) * this->width_;
   }

 protected:
@@ -165,7 +165,7 @@ struct output_indexalator : base_normalator<output_indexalator, cudf::size_type>
   __device__ inline output_indexalator const operator[](size_type idx) const
   {
     output_indexalator tmp{*this};
-    tmp.p_ += (idx * this->width_);
+    tmp.p_ += static_cast<std::ptrdiff_t>(idx) * this->width_;
     return tmp;
   }

From efa97704d0c1ee83d04ab59f1746194c86743656 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Tue, 27 Aug 2024 11:02:22 -0500
Subject: [PATCH 124/270] Drop Python 3.9 support (#16637)

Contributes to https://github.com/rapidsai/build-planning/issues/88

Finishes the work of dropping Python 3.9 support.

This project stopped building / testing against Python 3.9 as of
https://github.com/rapidsai/shared-workflows/pull/235. This PR updates
configuration and docs to reflect that.

## Notes for Reviewers

### How I tested this

Checked that there were no remaining uses like this:

```shell
git grep -E '3\.9'
git grep '39'
git grep 'py39'
```

And similar for variations on Python 3.8 (to catch things that were missed
the last time this was done).

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16637
---
 README.md                                           |  2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml    |  2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml    |  2 +-
 cpp/cmake/thirdparty/get_arrow.cmake                |  2 +-
 dependencies.yaml                                   |  6 +----
 python/cudf/pyproject.toml                          |  3 +--
 python/cudf_kafka/pyproject.toml                    |  2 +-
 python/cudf_polars/cudf_polars/containers/dataframe.py | 13 +++++----
 python/cudf_polars/cudf_polars/dsl/ir.py            | 27 ++++++++++++-------
 python/cudf_polars/cudf_polars/typing/__init__.py   |  4 +--
 python/cudf_polars/cudf_polars/utils/sorting.py     |  2 +-
 python/cudf_polars/pyproject.toml                   | 12 ++++++---
 python/custreamz/pyproject.toml                     |  3 +--
 python/dask_cudf/pyproject.toml                     |  3 +--
 python/pylibcudf/pyproject.toml                     |  3 +--
 15 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index fd8b0365807..f1b010394d6 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ conda install -c rapidsai -c conda-forge -c nvidia \
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
 of our latest development branch.
-Note: cuDF is supported only on Linux, and with Python versions 3.9 and later. +Note: cuDF is supported only on Linux, and with Python versions 3.10 and later. See the [RAPIDS installation guide](https://docs.rapids.ai/install) for more OS and version info. diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 5cf7508ba51..fcd6e27a7f6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -76,7 +76,7 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 -- python>=3.9,<3.12 +- python>=3.10,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==24.10.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 28b927254f7..bedc3a90885 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -74,7 +74,7 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 -- python>=3.9,<3.12 +- python>=3.10,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==24.10.*,>=0.0.0a0 diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 0afdc526981..e3e6a07661a 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -45,7 +45,7 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION) APPEND initial_code_block [=[ -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) +find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) execute_process( COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR diff --git a/dependencies.yaml b/dependencies.yaml index 194577817db..04b5940c9fb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -584,10 +584,6 @@ dependencies: specific: - output_types: conda matrices: - - matrix: - py: "3.9" - packages: - - python=3.9 - matrix: py: "3.10" packages: @@ -598,7 +594,7 @@ dependencies: - python=3.11 - matrix: packages: - - python>=3.9,<3.12 + - python>=3.10,<3.12 run_common: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index e7bac17f8ba..a6d26d17d46 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -16,7 +16,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", @@ -42,7 +42,6 @@ classifiers = [ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 2d0222a3fe9..01e7299a33a 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -16,7 +16,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "cudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
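One detail worth noting before the `cudf_polars` diffs below: they adopt
`zip(..., strict=True)`, which was added in Python 3.10 (PEP 618), so it only
becomes usable once 3.9 support is dropped. A minimal standard-library sketch
of the behavior difference:

```python
names = ["a", "b"]
columns = [1, 2, 3]

# Plain zip() silently truncates to the shorter input: [("a", 1), ("b", 2)]
print(list(zip(names, columns)))

# Python 3.10+: strict=True turns a silent length mismatch into an error.
try:
    list(zip(names, columns, strict=True))
except ValueError as exc:
    print(exc)  # zip() argument 2 is longer than argument 1
```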
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 7c28e7b9a6c..a5c99e2bc11 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -105,7 +105,9 @@ def from_polars(cls, df: pl.DataFrame) -> Self: return cls( [ NamedColumn(column, h_col.name).copy_metadata(h_col) - for column, h_col in zip(d_table.columns(), df.iter_columns()) + for column, h_col in zip( + d_table.columns(), df.iter_columns(), strict=True + ) ] ) @@ -134,8 +136,10 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls( - # TODO: strict=True when we drop py39 - [NamedColumn(c, name) for c, name in zip(table.columns(), names)] + [ + NamedColumn(c, name) + for c, name in zip(table.columns(), names, strict=True) + ] ) def sorted_like( @@ -165,8 +169,7 @@ def sorted_like( subset = self.column_names_set if subset is None else subset self.columns = [ c.sorted_like(other) if c.name in subset else c - # TODO: strict=True when we drop py39 - for c, other in zip(self.columns, like.columns) + for c, other in zip(self.columns, like.columns, strict=True) ] return self diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 019f00f4fca..ebc7dee6bfb 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -310,7 +310,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: *( (piece.tbl, piece.column_names(include_children=False)) for piece in pieces - ) + ), + strict=True, ) df = DataFrame.from_table( plc.concatenate.concatenate(list(tables)), @@ -426,7 +427,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pdf = pdf.select(self.projection) df = DataFrame.from_polars(pdf) assert all( - c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values()) + c.obj.type() == dtype + for c, dtype in zip(df.columns, self.schema.values(), strict=True) ) if self.predicate is not None: (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) @@ -600,9 +602,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for i, table in enumerate(raw_tables): (column,) = table.columns() raw_columns.append(NamedColumn(column, f"tmp{i}")) - mapping = dict(zip(replacements, raw_columns)) + mapping = dict(zip(replacements, raw_columns, strict=True)) result_keys = [ - NamedColumn(gk, k.name) for gk, k in zip(group_keys.columns(), keys) + NamedColumn(gk, k.name) + for gk, k in zip(group_keys.columns(), keys, strict=True) ] result_subs = DataFrame(raw_columns) results = [ @@ -752,7 +755,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: columns = plc.join.cross_join(left.table, right.table).columns() left_cols = [ NamedColumn(new, old.name).sorted_like(old) - for new, old in zip(columns[: left.num_columns], left.columns) + for new, old in zip( + columns[: left.num_columns], left.columns, strict=True + ) ] right_cols = [ NamedColumn( @@ -761,7 +766,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: if old.name not in left.column_names_set else f"{old.name}{suffix}", ) - for new, old in zip(columns[left.num_columns :], right.columns) + for new, old in zip( + columns[left.num_columns :], right.columns, strict=True + ) ] return 
DataFrame([*left_cols, *right_cols]) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 @@ -803,6 +810,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for left_col, right_col in zip( left.select_columns(left_on.column_names_set), right.select_columns(right_on.column_names_set), + strict=True, ) ) ) @@ -909,7 +917,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: result = DataFrame( [ NamedColumn(c, old.name).sorted_like(old) - for c, old in zip(table.columns(), df.columns) + for c, old in zip(table.columns(), df.columns, strict=True) ] ) if keys_sorted or self.stable: @@ -974,7 +982,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: self.null_order, ) columns = [ - NamedColumn(c, old.name) for c, old in zip(table.columns(), df.columns) + NamedColumn(c, old.name) + for c, old in zip(table.columns(), df.columns, strict=True) ] # If a sort key is in the result table, set the sortedness property for k, i in enumerate(keys_in_result): @@ -1089,7 +1098,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # final tag is "swapping" which is useful for the # optimiser (it blocks some pushdown operations) old, new, _ = self.options - return df.rename_columns(dict(zip(old, new))) + return df.rename_columns(dict(zip(old, new, strict=True))) elif self.name == "explode": df = self.df.evaluate(cache=cache) ((to_explode,),) = self.options diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 02440e67fde..5276073e62a 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -13,9 +13,7 @@ from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir if TYPE_CHECKING: - from typing import Callable - - from typing_extensions import TypeAlias + from typing import Callable, TypeAlias import polars as pl diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index 17ea44e5b1b..6ce216cbf8f 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -45,7 +45,7 @@ def sort_order( null_precedence = [] if len(descending) != len(nulls_last) or len(descending) != num_keys: raise ValueError("Mismatching length of arguments in sort_order") - for asc, null_last in zip(column_order, nulls_last): + for asc, null_last in zip(column_order, nulls_last, strict=True): if (asc == plc.types.Order.ASCENDING) ^ (not null_last): null_precedence.append(plc.types.NullOrder.AFTER) elif (asc == plc.types.Order.ASCENDING) ^ null_last: diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index c380853035d..0382e3ce6a2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -17,7 +17,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "polars>=1.0,<1.3", "pylibcudf==24.10.*,>=0.0.0a0", @@ -28,7 +28,6 @@ classifiers = [ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] @@ -62,7 +61,7 @@ exclude_also = [ [tool.ruff] line-length = 88 indent-width = 4 -target-version = "py39" 
+target-version = "py310" fix = true [tool.ruff.lint] @@ -115,6 +114,9 @@ ignore = [ "TD003", # Missing issue link on the line following this TODO # tryceratops "TRY003", # Avoid specifying long messages outside the exception class + # pyupgrade + "UP035", # Import from `collections.abc` instead: `Callable` + "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` # Lints below are turned off because of conflicts with the ruff # formatter # See https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules @@ -137,6 +139,10 @@ fixable = ["ALL"] [tool.ruff.lint.per-file-ignores] "**/tests/**/*.py" = ["D"] +"**/cudf_polars/typing/__init__.py" = [ + # pyupgrade + "UP007", # Use `X | Y` for type annotations +] [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index d6b88167262..be5331236a5 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -17,7 +17,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", "cudf==24.10.*,>=0.0.0a0", @@ -31,7 +31,6 @@ classifiers = [ "Topic :: Apache Kafka", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index d5da7030a75..93bf532d67f 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -17,7 +17,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "cudf==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", @@ -32,7 +32,6 @@ classifiers = [ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 5f5594b462b..0d673ea4cc3 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -16,7 +16,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "cuda-python>=11.7.1,<12.0a0", "libcudf==24.10.*,>=0.0.0a0", @@ -32,7 +32,6 @@ classifiers = [ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] From f1cc962df38b1fc113b579bef57a27f93d11cec2 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Tue, 27 Aug 2024 09:16:39 -0700 Subject: [PATCH 125/270] Fix `cudf::rank` not getting enough params (#16666) Fix issue #16624 Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16666 --- cpp/benchmarks/sort/rank_lists.cpp | 2 ++ cpp/benchmarks/sort/rank_structs.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cpp/benchmarks/sort/rank_lists.cpp 
b/cpp/benchmarks/sort/rank_lists.cpp
index fbdb40b3537..7015fe08089 100644
--- a/cpp/benchmarks/sort/rank_lists.cpp
+++ b/cpp/benchmarks/sort/rank_lists.cpp
@@ -37,6 +37,8 @@ void nvbench_rank_lists(nvbench::state& state, nvbench::type_list<nvbench::enum_type<method>>)

[The hunk bodies for rank_lists.cpp and rank_structs.cpp (two inserted lines each, per the diffstat above) and the opening "From <sha>" line of the next patch were lost in extraction.]

From: Yunsong Wang
Date: Tue, 27 Aug 2024 09:30:16 -0700
Subject: [PATCH 126/270] Add `num_multiprocessors` utility (#16628)

This PR introduces a new `num_multiprocessors` utility and moves the existing `elements_per_thread` host utility to the new `cuda.hpp` header.

Needed by #16619.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16628
---
 cpp/CMakeLists.txt                            |  1 +
 cpp/benchmarks/join/generate_input_tables.cuh | 10 +---
 cpp/include/cudf/detail/copy_if.cuh           |  1 +
 cpp/include/cudf/detail/utilities/cuda.cuh    | 29 ---------
 cpp/include/cudf/detail/utilities/cuda.hpp    | 59 +++++++++++++++++++
 cpp/src/io/comp/debrotli.cu                   | 18 +++---
 cpp/src/utilities/cuda.cpp                    | 34 +++++++++++
 7 files changed, 105 insertions(+), 47 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/cuda.hpp
 create mode 100644 cpp/src/utilities/cuda.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a6f72ed6b75..4080c5d02da 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -666,6 +666,7 @@ add_library(
   src/unary/math_ops.cu
   src/unary/nan_ops.cu
   src/unary/null_ops.cu
+  src/utilities/cuda.cpp
   src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
   src/utilities/host_memory.cpp
diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index f7984b29d6b..75bbe8174d3 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -17,6 +17,7 @@
 #pragma once

 #include
+#include <cudf/detail/utilities/cuda.hpp>
 #include
 #include
 #include
@@ -150,13 +151,8 @@ void generate_input_tables(key_type* const build_tbl,
   CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_probe_tbl, init_probe_tbl, block_size, 0));

-  int dev_id{-1};
-  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
-
-  int num_sms{-1};
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
-
-  int const num_states =
+  auto const num_sms    = cudf::detail::num_multiprocessors();
+  auto const num_states =
     num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
   rmm::device_uvector<curandState> devStates(num_states, cudf::get_default_stream());
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index b6310e6cd2f..4071fa01fb2 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <cudf/detail/utilities/cuda.hpp>
 #include
 #include
 #include
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 5007af7f9f1..d31ca3d92d1 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -189,35 +189,6 @@ __device__ T single_lane_block_sum_reduce(T lane_value)
   return result;
 }

-/**
- * @brief Get the number of elements that can be processed per thread.
- *
- * @param[in] kernel The kernel for which the elements per thread needs to be assessed
- * @param[in] total_size Number of elements
- * @param[in] block_size Expected block size
- *
- * @return cudf::size_type Elements per thread that can be processed for given specification.
- */
-template <typename Kernel>
-cudf::size_type elements_per_thread(Kernel kernel,
-                                    cudf::size_type total_size,
-                                    cudf::size_type block_size,
-                                    cudf::size_type max_per_thread = 32)
-{
-  CUDF_FUNC_RANGE();
-
-  // calculate theoretical occupancy
-  int max_blocks = 0;
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
-
-  int device = 0;
-  CUDF_CUDA_TRY(cudaGetDevice(&device));
-  int num_sms = 0;
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
-  int per_thread = total_size / (max_blocks * num_sms * block_size);
-  return std::clamp(per_thread, 1, max_per_thread);
-}
-
 /**
  * @brief Finds the smallest value not less than `number_to_round` and modulo `modulus` is
  * zero. Expects modulus to be a power of 2.
diff --git a/cpp/include/cudf/detail/utilities/cuda.hpp b/cpp/include/cudf/detail/utilities/cuda.hpp
new file mode 100644
index 00000000000..58c7ae8ed6a
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/cuda.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+/**
+ * @brief Get the number of multiprocessors on the device
+ */
+cudf::size_type num_multiprocessors();
+
+/**
+ * @brief Get the number of elements that can be processed per thread.
+ *
+ * @param[in] kernel The kernel for which the elements per thread needs to be assessed
+ * @param[in] total_size Number of elements
+ * @param[in] block_size Expected block size
+ *
+ * @return cudf::size_type Elements per thread that can be processed for given specification.
+ */
+template <typename Kernel>
+cudf::size_type elements_per_thread(Kernel kernel,
+                                    cudf::size_type total_size,
+                                    cudf::size_type block_size,
+                                    cudf::size_type max_per_thread = 32)
+{
+  CUDF_FUNC_RANGE();
+
+  // calculate theoretical occupancy
+  int max_blocks = 0;
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
+
+  int per_thread = total_size / (max_blocks * num_multiprocessors() * block_size);
+  return std::clamp(per_thread, 1, max_per_thread);
+}
+
+} // namespace detail
+} // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 861820f47e7..72649dbe427 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -58,6 +58,7 @@ THE SOFTWARE.
 #include "gpuinflate.hpp"
 #include "io/utilities/block_utils.cuh"

+#include <cudf/detail/utilities/cuda.hpp>
 #include
 #include
@@ -2047,19 +2048,14 @@ CUDF_KERNEL void __launch_bounds__(block_size, 2)
  */
 size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
 {
-  int sm_count = 0;
-  int dev      = 0;
   uint32_t max_fb_size, min_fb_size, fb_size;
-  CUDF_CUDA_TRY(cudaGetDevice(&dev));
-  if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) {
-    // printf("%d SMs on device %d\n", sm_count, dev);
-    max_num_inputs =
-      min(max_num_inputs, sm_count * 3);  // no more than 3 blocks/sm at most due to 32KB smem use
-    if (max_num_inputs <= 0) {
-      max_num_inputs = sm_count * 2;  // Target 2 blocks/SM by default for scratch mem computation
-    }
+  auto const sm_count = cudf::detail::num_multiprocessors();
+  // no more than 3 blocks/sm at most due to 32KB smem use
+  max_num_inputs = std::min(max_num_inputs, sm_count * 3);
+  if (max_num_inputs <= 0) {
+    max_num_inputs = sm_count * 2;  // Target 2 blocks/SM by default for scratch mem computation
   }
-  max_num_inputs = min(max(max_num_inputs, 1), 512);
+  max_num_inputs = std::min(std::max(max_num_inputs, 1), 512);
   // Max fb size per block occurs if all huffman tables for all 3 group types fail local_alloc()
   // with num_htrees=256 (See HuffmanTreeGroupAlloc)
   max_fb_size = 256 * (630 + 1080 + 920) * 2;  // 1.3MB
diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp
new file mode 100644
index 00000000000..53ca0608170
--- /dev/null
+++ b/cpp/src/utilities/cuda.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/cuda.hpp>
+#include
+#include
+
+#include
+
+namespace cudf::detail {
+
+cudf::size_type num_multiprocessors()
+{
+  int device = 0;
+  CUDF_CUDA_TRY(cudaGetDevice(&device));
+  int num_sms = 0;
+  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+  return num_sms;
+}
+
+} // namespace cudf::detail

From dd585e84756992bee0ecbae6f77107d64cddaede Mon Sep 17 00:00:00 2001
From: Kyle Edwards
Date: Tue, 27 Aug 2024 12:58:02 -0400
Subject: [PATCH 127/270] Prune workflows based on changed files (#16642)

Only run tests based on things that have actually changed. For example, if only Python files have changed, we don't need to run the C++ tests.
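The gating pattern, condensed from the pr.yaml diff below: a single `changed-files` job classifies the touched paths into per-area outputs, each test job lists it in `needs` and gates on the matching output with an `if:` condition, and `pr-builder` switches to `if: always()` with `needs: ${{ toJSON(needs) }}` so intentionally skipped jobs still count toward a passing required check. A condensed sketch (job and output names taken from the diff; the `files_yaml` path groups are elided here):

```yaml
# One job computes "did anything relevant change?" per area...
changed-files:
  runs-on: ubuntu-latest
  outputs:
    test_cpp: ${{ steps.changed-files.outputs.cpp_any_changed == 'true' }}
  steps:
    - id: changed-files
      uses: tj-actions/changed-files@v45

# ...and each test job consumes the answer as a gate.
conda-cpp-tests:
  needs: [conda-cpp-build, changed-files]
  if: needs.changed-files.outputs.test_cpp == 'true'
```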
Contributes to https://github.com/rapidsai/build-planning/issues/94 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/16642 --- .github/workflows/pr.yaml | 88 ++++++++++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 10 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 2e2a8b6b9bc..35c7e3d95b6 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,7 @@ concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-checks @@ -37,6 +38,63 @@ jobs: - pandas-tests-diff secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + if: always() + with: + needs: ${{ toJSON(needs) }} + changed-files: + runs-on: ubuntu-latest + name: "Check changed files" + outputs: + test_cpp: ${{ steps.changed-files.outputs.cpp_any_changed == 'true' }} + test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }} + test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }} + test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }} + steps: + - name: Get PR info + id: get-pr-info + uses: rapidsai/shared-actions/get-pr-info@main + - name: Checkout code repo + uses: actions/checkout@v4 + with: + ref: ${{ inputs.sha }} + fetch-depth: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).commits }} + persist-credentials: false + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v45 + with: + base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }} + files_yaml: | + cpp: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + - '!python/**' + java: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!python/**' + notebooks: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!java/**' + python: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 @@ -56,9 +114,10 @@ jobs: build_type: pull-request enable_check_symbols: true conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + if: needs.changed-files.outputs.test_cpp == 'true' with: build_type: pull-request conda-python-build: @@ -68,24 +127,27 @@ jobs: with: build_type: pull-request conda-python-cudf-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request script: "ci/test_python_cudf.sh" conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: - needs: 
conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + if: needs.changed-files.outputs.test_java == 'true' with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -103,9 +165,10 @@ jobs: container_image: "rapidsai/ci-wheel:latest" run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + if: needs.changed-files.outputs.test_notebooks == 'true' with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -145,9 +208,10 @@ jobs: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: - needs: wheel-build-cudf + needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request script: ci/test_wheel_cudf.sh @@ -161,9 +225,10 @@ jobs: build_type: pull-request script: "ci/build_wheel_cudf_polars.sh" wheel-tests-cudf-polars: - needs: wheel-build-cudf-polars + needs: [wheel-build-cudf-polars, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -181,9 +246,10 @@ jobs: build_type: pull-request script: "ci/build_wheel_dask_cudf.sh" wheel-tests-dask-cudf: - needs: wheel-build-dask-cudf + needs: [wheel-build-dask-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -200,9 +266,10 @@ jobs: build-all -DBUILD_BENCHMARKS=ON --verbose; sccache -s; unit-tests-cudf-pandas: - needs: wheel-build-cudf + needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -210,9 +277,10 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh pandas-tests: # run the Pandas unit tests using PR branch - needs: wheel-build-cudf + needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) From 6747d2dc9d0deb4585b6306fed8a41bdf65e5558 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 27 Aug 2024 14:48:14 -0400 Subject: [PATCH 128/270] Update rapidsai/pre-commit-hooks (#16669) This PR updates rapidsai/pre-commit-hooks to the version 0.4.0. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16669 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b17eae0842..f861fb57916 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -144,7 +144,7 @@ repos: - id: ruff-format files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.3.1 + rev: v0.4.0 hooks: - id: verify-copyright exclude: | From 1a2aad27b7e136f87be80debed6da7d3528ebda1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Aug 2024 13:33:47 -0700 Subject: [PATCH 129/270] Remove arrow dependency (#16640) This PR removes libarrow as a dependency of libcudf since we no longer use any of its APIs in our C++ code. The following places remain dependent on libarrow: - tests: We have tests demonstrating how to interoperate with libarrow objects, as well as other tests that leverage Arrow for I/O. - examples: We have an example demonstrating interop with libarrow arrays. - JNI: The JNI is still using libarrow to handle ingestion or production of Arrow buffers. In all three cases above, we are now statically linking libarrow. We also always pull it in via CPM, which means that we never require libarrow to exist on the user's system anymore. Of the above three cases, we should expect the first two to persist indefinitely. The JNI could be updated to use nanoarrow instead if desired, but that is not critical since the primary benefit of removing libarrow as a direct dependency is to remove it as a constraint for package managers such as conda in environments where we must match the version of Arrow required by other dependencies. pyarrow remains a dependency of the cudf Python packages. For now, this PR retains the tight pinning on 16.1 since we know that this version works. A future PR will loosen this pinning since we are no longer constrained to ABI-compatible versions and can support a range of pyarrow versions that support the necessary Python APIs (I believe pyarrow>=13 will work, but that remains to be tested). 
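For the three remaining consumers, the diffs below converge on one pattern: Arrow is always fetched via CPM rather than taken from the user's system, built statically by default, excluded from installation, and linked privately only where it is still used. A hedged sketch for a hypothetical target (`my_tool` is illustrative; the real call sites are `cpp/tests`, `cpp/examples/interop`, and the JNI):

```cmake
# Sketch: opting a component into a private, static Arrow.
# CUDF_USE_ARROW_STATIC already defaults to ON inside get_arrow.cmake.
set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) # do not install Arrow libs with this target
include(../cmake/thirdparty/get_arrow.cmake) # populates ${ARROW_LIBRARIES}

target_link_libraries(my_tool PRIVATE cudf::cudf ${ARROW_LIBRARIES})
```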
Resolves #15193 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - James Lamb (https://github.com/jameslamb) - Robert Maynard (https://github.com/robertmaynard) - https://github.com/jakirkham - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16640 --- ci/build_wheel_cudf.sh | 1 - ci/build_wheel_libcudf.sh | 2 +- ci/build_wheel_pylibcudf.sh | 1 - .../all_cuda-118_arch-x86_64.yaml | 6 - .../all_cuda-125_arch-x86_64.yaml | 6 - conda/recipes/cudf/meta.yaml | 4 +- conda/recipes/libcudf/conda_build_config.yaml | 3 - conda/recipes/libcudf/meta.yaml | 2 - conda/recipes/pylibcudf/meta.yaml | 4 +- cpp/CMakeLists.txt | 27 +- cpp/cmake/thirdparty/get_arrow.cmake | 285 +++++++----------- cpp/examples/interop/CMakeLists.txt | 7 + cpp/tests/CMakeLists.txt | 23 +- dependencies.yaml | 42 +-- java/src/main/native/CMakeLists.txt | 4 + python/cudf/CMakeLists.txt | 1 - python/cudf/cudf/_lib/CMakeLists.txt | 3 - python/cudf/cudf/_lib/io/CMakeLists.txt | 2 - python/cudf/pyproject.toml | 2 - .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt | 2 - python/cudf_kafka/pyproject.toml | 2 - python/libcudf/CMakeLists.txt | 9 +- python/libcudf/libcudf/load.py | 4 - python/libcudf/pyproject.toml | 1 - python/pylibcudf/CMakeLists.txt | 1 - .../cmake/Modules/LinkPyarrowHeaders.cmake | 40 --- python/pylibcudf/pylibcudf/io/CMakeLists.txt | 5 - .../pylibcudf/libcudf/io/CMakeLists.txt | 3 - python/pylibcudf/pyproject.toml | 4 +- 29 files changed, 145 insertions(+), 351 deletions(-) delete mode 100644 python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index cf33703f544..e5565c4b53c 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -22,7 +22,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" python -m auditwheel repair \ --exclude libcudf.so \ - --exclude libarrow.so.1601 \ --exclude libnvcomp.so \ --exclude libnvcomp_bitcomp.so \ --exclude libnvcomp_gdeflate.so \ diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 9694c3f6144..8975381ceba 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -10,6 +10,6 @@ package_dir="python/libcudf" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" mkdir -p ${package_dir}/final_dist -python -m auditwheel repair --exclude libarrow.so.1601 -w ${package_dir}/final_dist ${package_dir}/dist/* +python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 7181a49d397..0e4745bda28 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -20,7 +20,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" python -m auditwheel repair \ --exclude libcudf.so \ - --exclude libarrow.so.1601 \ --exclude libnvcomp.so \ --exclude libnvcomp_bitcomp.so \ --exclude libnvcomp_gdeflate.so \ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index fcd6e27a7f6..96596958636 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -37,15 +37,11 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==16.1.0.* -- libarrow-dataset==16.1.0.* -- libarrow==16.1.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - 
libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libkvikio==24.10.*,>=0.0.0a0 -- libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.10.*,>=0.0.0a0 - make @@ -56,7 +52,6 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 @@ -68,7 +63,6 @@ dependencies: - pandoc - pre-commit - ptxcompiler -- pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index bedc3a90885..efc5f76b90f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -38,13 +38,9 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==16.1.0.* -- libarrow-dataset==16.1.0.* -- libarrow==16.1.0.* - libcufile-dev - libcurand-dev - libkvikio==24.10.*,>=0.0.0a0 -- libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.10.*,>=0.0.0a0 - make @@ -55,7 +51,6 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==3.0.6 @@ -65,7 +60,6 @@ dependencies: - pandas>=2.0,<2.2.3dev0 - pandoc - pre-commit -- pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 - pynvjitlink>=0.0.0a0 - pytest-benchmark diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index b2dad767da4..53f52a35651 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -64,8 +64,6 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - - numpy 2.0 - - pyarrow ==16.1.0.* - libcudf ={{ version }} - pylibcudf ={{ version }} - rmm ={{ minor_version }} @@ -84,7 +82,7 @@ requirements: - cupy >=12.0.0 - numba >=0.57 - numpy >=1.23,<3.0a0 - - {{ pin_compatible('pyarrow', max_pin='x.x') }} + - pyarrow ==16.1.0.* - libcudf ={{ version }} - pylibcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index ff7458caf82..4b1c4cca828 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -19,9 +19,6 @@ c_stdlib_version: cmake_version: - ">=3.26.4,!=3.30.0" -libarrow_version: - - "==16.1.0" - dlpack_version: - ">=0.8,<1.0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index aa1c94a4bca..1c2e9e8dd98 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -64,7 +64,6 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - nvcomp {{ nvcomp_version }} - - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} @@ -92,7 +91,6 @@ outputs: - cmake {{ cmake_version }} host: - cuda-version ={{ cuda_version }} - - libarrow {{ libarrow_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index fef78467027..67b9b76bb8c 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -64,8 +64,6 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - - numpy 2.0 - - pyarrow ==16.1.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} @@ -81,7 +79,7 @@ requirements: - typing_extensions >=4.0.0 - pandas 
>=2.0,<2.2.3dev0 - numpy >=1.23,<3.0a0 - - {{ pin_compatible('pyarrow', max_pin='x.x') }} + - pyarrow ==16.1.0.* - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4080c5d02da..1040fcb7b91 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,11 +54,6 @@ mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) -option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) -option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) -option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) -option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) -option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" OFF) option( CUDF_USE_PER_THREAD_DEFAULT_STREAM "Build cuDF with per-thread default stream, including passing the per-thread default @@ -81,8 +76,6 @@ option(CUDA_ENABLE_LINEINFO option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) -option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON) if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS) @@ -100,8 +93,6 @@ message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}") message(VERBOSE "CUDF: Build cuDF shared libraries: ${BUILD_SHARED_LIBS}") message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_CACHE}") -message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}") -message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ENABLE_ARROW_S3}") message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}") message( VERBOSE @@ -192,8 +183,6 @@ include(cmake/thirdparty/get_nvcomp.cmake) include(cmake/thirdparty/get_cccl.cmake) # find rmm include(cmake/thirdparty/get_rmm.cmake) -# find arrow -include(cmake/thirdparty/get_arrow.cmake) # find flatbuffers include(cmake/thirdparty/get_flatbuffers.cmake) # find dlpack @@ -807,7 +796,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ + PUBLIC CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) @@ -1056,20 +1045,6 @@ following IMPORTED GLOBAL targets: ]=] ) -if(CUDF_ENABLE_ARROW_PARQUET) - string( - APPEND - install_code_string - [=[ - if(NOT Parquet_DIR) - set(Parquet_DIR "${Arrow_DIR}") - endif() - set(ArrowDataset_DIR "${Arrow_DIR}") - find_dependency(ArrowDataset) - ]=] - ) -endif() - rapids_export( INSTALL cudf EXPORT_SET cudf-exports ${_components_export_string} diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index e3e6a07661a..07cbf5150f4 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -22,82 +22,8 @@ 
include_guard(GLOBAL) -# Generate a FindArrow module for the case where we need to search for arrow within a pip install -# pyarrow. -function(find_libarrow_in_python_wheel PYARROW_VERSION) - string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}") - list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER) - list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER) - - # Ensure that the major and minor versions are two digits long - string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH) - string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH) - if(${PYARROW_MAJOR_LENGTH} EQUAL 1) - set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}") - endif() - if(${PYARROW_MINOR_LENGTH} EQUAL 1) - set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}") - endif() - - set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}") - - string( - APPEND - initial_code_block - [=[ -find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" - OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - COMMAND_ERROR_IS_FATAL ANY -) -list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") -]=] - ) - string( - APPEND - final_code_block - [=[ -list(POP_BACK CMAKE_PREFIX_PATH) -]=] - ) - rapids_find_generate_module( - Arrow NO_CONFIG - VERSION "${PYARROW_VERSION}" - LIBRARY_NAMES "${PYARROW_LIB}" - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - HEADER_NAMES arrow/python/arrow_to_pandas.h INITIAL_CODE_BLOCK initial_code_block - FINAL_CODE_BLOCK final_code_block - ) - - find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) - add_library(arrow_shared ALIAS Arrow::Arrow) - - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) -endfunction() - # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON - ENABLE_PARQUET PYARROW_LIBARROW -) - - if(PYARROW_LIBARROW) - # Generate a FindArrow.cmake to find pyarrow's libarrow.so - find_libarrow_in_python_wheel(${VERSION}) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) - return() - endif() - +function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_PARQUET) if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND @@ -124,10 +50,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() - if(NOT ARROW_ARMV8_ARCH) - set(ARROW_ARMV8_ARCH "armv8-a") - endif() - if(NOT ARROW_SIMD_LEVEL) set(ARROW_SIMD_LEVEL "NONE") endif() @@ -150,14 +72,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_OPENSSL_USE_SHARED ON) endif() - set(ARROW_PYTHON_OPTIONS "") - if(ENABLE_PYTHON) - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. - list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") - endif() - set(ARROW_PARQUET_OPTIONS "") if(ENABLE_PARQUET) # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
@@ -174,6 +88,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp + EXCLUDE_FROM_ALL ${EXCLUDE_FROM_ALL} OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" "ARROW_ACERO ON" "ARROW_IPC ON" @@ -181,16 +96,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ENABLE_S3}" - "ARROW_ORC ${ENABLE_ORC}" - # e.g. needed by blazingsql-io + "ARROW_S3 OFF" + "ARROW_ORC OFF" ${ARROW_PARQUET_OPTIONS} "ARROW_PARQUET ${ENABLE_PARQUET}" "ARROW_FILESYSTEM ON" - ${ARROW_PYTHON_OPTIONS} + "ARROW_PYTHON OFF" # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" - "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}" "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}" "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}" @@ -269,7 +182,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() if(Arrow_ADDED) - set(arrow_code_string [=[ if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) @@ -324,101 +236,106 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() - rapids_export( - BUILD Arrow - VERSION ${VERSION} - EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_code_string - ) - - if(ENABLE_PARQUET) - - set(arrow_acero_code_string - [=[ - if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) - add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) - endif() - if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) - add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) - endif() - ]=] - ) + include(rapids-export) + if(NOT EXCLUDE_FROM_ALL) rapids_export( - BUILD ArrowAcero + BUILD Arrow VERSION ${VERSION} - EXPORT_SET arrow_acero_targets - GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + EXPORT_SET arrow_targets + GLOBAL_TARGETS arrow_shared arrow_static NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_acero_code_string + FINAL_CODE_BLOCK arrow_code_string ) - set(arrow_dataset_code_string - [=[ - if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) - add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) - endif() - if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) - add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) - endif() - ]=] - ) + if(ENABLE_PARQUET) + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) - rapids_export( - BUILD ArrowDataset - VERSION ${VERSION} - EXPORT_SET arrow_dataset_targets - GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_dataset_code_string - ) + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) - set(parquet_code_string - [=[ - if (TARGET 
cudf::parquet_shared AND (NOT TARGET parquet_shared)) - add_library(parquet_shared ALIAS cudf::parquet_shared) - endif() - if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) - add_library(parquet_static ALIAS cudf::parquet_static) - endif() - ]=] - ) + set(arrow_dataset_code_string + [=[ + if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) + add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) + endif() + if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) + add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) + endif() + ]=] + ) - rapids_export( - BUILD Parquet - VERSION ${VERSION} - EXPORT_SET parquet_targets - GLOBAL_TARGETS parquet_shared parquet_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK parquet_code_string - ) + rapids_export( + BUILD ArrowDataset + VERSION ${VERSION} + EXPORT_SET arrow_dataset_targets + GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_dataset_code_string + ) + set(parquet_code_string + [=[ + if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) + add_library(parquet_shared ALIAS cudf::parquet_shared) + endif() + if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) + add_library(parquet_static ALIAS cudf::parquet_static) + endif() + ]=] + ) + + rapids_export( + BUILD Parquet + VERSION ${VERSION} + EXPORT_SET parquet_targets + GLOBAL_TARGETS parquet_shared parquet_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK parquet_code_string + ) + endif() endif() endif() - # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) - if(ENABLE_PARQUET) - rapids_export_package(BUILD Parquet cudf-exports) - rapids_export_package(BUILD ArrowDataset cudf-exports) - endif() + if(NOT EXCLUDE_FROM_ALL) + # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) - rapids_export_find_package_root( - BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) - rapids_export_find_package_root( - BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) + if(ENABLE_PARQUET) + rapids_export_package(BUILD Parquet cudf-exports) + rapids_export_package(BUILD ArrowDataset cudf-exports) + endif() + + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) + rapids_export_find_package_root( + BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + rapids_export_find_package_root( + BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + endif() set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" @@ -435,7 +352,21 @@ if(NOT DEFINED CUDF_VERSION_Arrow) ) endif() +# Default to static arrow builds +if(NOT DEFINED CUDF_USE_ARROW_STATIC) + set(CUDF_USE_ARROW_STATIC ON) +endif() + +# Default to excluding from installation since we generally privately and statically link Arrow. 
+if(NOT DEFINED CUDF_EXCLUDE_ARROW_FROM_ALL) + set(CUDF_EXCLUDE_ARROW_FROM_ALL OFF) +endif() + +if(NOT DEFINED CUDF_ENABLE_ARROW_PARQUET) + set(CUDF_ENABLE_ARROW_PARQUET OFF) +endif() + find_and_configure_arrow( - ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ${USE_LIBARROW_FROM_PYARROW} + ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_EXCLUDE_ARROW_FROM_ALL} + ${CUDF_ENABLE_ARROW_PARQUET} ) diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index a1f99c1d2fd..2816f613d3d 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -15,6 +15,13 @@ project( include(../fetch_dependencies.cmake) +# The Arrow CMake is currently broken if the build type is not set +set(CMAKE_BUILD_TYPE Release) +# No need to install Arrow libs when only the final example executable is shipped. +set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) +include(../../cmake/thirdparty/get_arrow.cmake) + add_executable(interop interop.cpp) target_link_libraries(interop PRIVATE cudf::cudf) target_compile_features(interop PRIVATE cxx_std_17) +target_link_libraries(interop PRIVATE ${ARROW_LIBRARIES}) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ac77a362e1c..f86acbcc51b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -24,8 +24,8 @@ rapids_test_init() # properties and linking to build the test function(ConfigureTest CMAKE_TEST_NAME) set(options) - set(one_value GPUS PERCENT STREAM_MODE EXTRA_LIB) - set(multi_value) + set(one_value GPUS PERCENT STREAM_MODE) + set(multi_value EXTRA_LIBS) cmake_parse_arguments(_CUDF_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _CUDF_TEST_GPUS AND NOT DEFINED _CUDF_TEST_PERCENT) set(_CUDF_TEST_GPUS 1) @@ -57,7 +57,7 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main - nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIB}" + nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -78,6 +78,14 @@ function(ConfigureTest CMAKE_TEST_NAME) endif() endfunction() +# ################################################################################################## +# dependencies ################################################################################### +# ################################################################################################## + +# No need to install Arrow libs when only the final test executables are shipped. 
+set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) +include(../cmake/thirdparty/get_arrow.cmake) + # ################################################################################################## # test sources ################################################################################## # ################################################################################################## @@ -197,7 +205,7 @@ ConfigureTest( QUANTILES_TEST quantiles/percentile_approx_test.cpp quantiles/quantile_test.cpp quantiles/quantiles_test.cpp GPUS 1 - PERCENT 70 + PERCENT 70 EXTRA_LIBS ${ARROW_LIBRARIES} ) # ################################################################################################## @@ -276,8 +284,9 @@ ConfigureTest( interop/from_arrow_host_test.cpp interop/from_arrow_stream_test.cpp interop/dlpack_test.cpp - EXTRA_LIB + EXTRA_LIBS nanoarrow + ${ARROW_LIBRARIES} ) # ################################################################################################## @@ -288,7 +297,7 @@ ConfigureTest(ROW_SELECTION_TEST io/row_selection_test.cpp) ConfigureTest( CSV_TEST io/csv_test.cpp GPUS 1 - PERCENT 30 + PERCENT 30 EXTRA_LIBS ${ARROW_LIBRARIES} ) ConfigureTest( FILE_IO_TEST io/file_io_test.cpp @@ -316,7 +325,7 @@ ConfigureTest( ConfigureTest( JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu GPUS 1 - PERCENT 30 + PERCENT 30 EXTRA_LIBS ${ARROW_LIBRARIES} ) ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) diff --git a/dependencies.yaml b/dependencies.yaml index 04b5940c9fb..b55860815bf 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -17,7 +17,6 @@ files: - depends_on_rmm - develop - docs - - libarrow_build - notebooks - py_version - rapids_build_skbuild @@ -40,7 +39,6 @@ files: output: none includes: - cuda_version - - libarrow_run - test_cpp test_python: output: none @@ -58,7 +56,6 @@ files: - build_all - cuda - cuda_version - - libarrow_run - test_java test_notebooks: output: none @@ -77,7 +74,6 @@ files: - cuda - cuda_version - docs - - libarrow_run - py_version py_build_cudf: output: pyproject @@ -137,7 +133,6 @@ files: includes: - build_base - build_cpp - - build_python_libcudf - depends_on_librmm py_run_libcudf: output: pyproject @@ -389,38 +384,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cython>=3.0.3 - # Hard pin the patch version used during the build. This must be kept - # in sync with the version pinned in get_arrow.cmake. - - &pyarrow_build pyarrow==16.1.0.* - - output_types: pyproject - packages: - # Hard pin the version used during the build. - # Sync with conda build constraint & wheel run constraint. - - numpy==2.0.* - build_python_libcudf: - common: - - output_types: [conda, requirements, pyproject] - packages: - - *pyarrow_build - libarrow_build: - common: - - output_types: conda - packages: - # Hard pin the Arrow patch version used during the build. This must - # be kept in sync with the version pinned in get_arrow.cmake. 
- - libarrow-acero==16.1.0.* - - libarrow-dataset==16.1.0.* - - libarrow==16.1.0.* - - libparquet==16.1.0.* - libarrow_run: - common: - - output_types: conda - packages: - # Allow runtime version to float up to patch version - - libarrow-acero>=16.1.0,<16.2.0a0 - - libarrow-dataset>=16.1.0,<16.2.0a0 - - libarrow>=16.1.0,<16.2.0a0 - - libparquet>=16.1.0,<16.2.0a0 pyarrow_run: common: - output_types: [conda, requirements, pyproject] @@ -600,7 +563,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - numpy>=1.23,<3.0a0 + - &numpy numpy>=1.23,<3.0a0 - pandas>=2.0,<2.2.3dev0 run_pylibcudf: common: @@ -731,6 +694,7 @@ dependencies: - *cmake_ver - maven - openjdk=8.* + - boost test_python_common: common: - output_types: [conda, requirements, pyproject] @@ -744,7 +708,7 @@ dependencies: packages: - fastavro>=0.22.9 - hypothesis - - numpy + - *numpy - pandas test_python_cudf: common: diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 22059c5bc7f..c18a90140b6 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -212,6 +212,10 @@ target_compile_definitions( ) target_link_options(cudfjni PRIVATE "-Wl,--no-undefined") +set(CUDF_ENABLE_ARROW_PARQUET ON) +include(../../../../cpp/cmake/thirdparty/get_arrow.cmake) +target_link_libraries(cudfjni PRIVATE ${ARROW_LIBRARIES}) + if(USE_GDS) add_library(cufilejni src/CuFileJni.cpp) set_target_properties( diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 72f20b30052..7193ada5b93 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -35,7 +35,6 @@ include(../../cpp/cmake/thirdparty/get_dlpack.cmake) include(rapids-cython-core) rapids_cython_init() -include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5ea378fc0e5..5d4b5421f16 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -65,9 +65,6 @@ rapids_cython_create_modules( target_link_libraries(strings_udf PUBLIC cudf_strings_udf) target_include_directories(interop PUBLIC "$") -set(targets_using_arrow_headers avro csv orc json parquet) -link_to_pyarrow_headers("${targets_using_arrow_headers}") - include(${rapids-cmake-dir}/export/find_package_root.cmake) include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(interop PUBLIC nanoarrow) diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt index 620229a1275..e7408cf2852 100644 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/io/CMakeLists.txt @@ -19,5 +19,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf ) - -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index a6d26d17d46..8386935fab0 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -129,8 +129,6 @@ requires = [ "libcudf==24.10.*,>=0.0.0a0", "librmm==24.10.*,>=0.0.0a0", "ninja", - "numpy==2.0.*", - "pyarrow==16.1.0.*", "pylibcudf==24.10.*,>=0.0.0a0", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt index 1b205537d73..4490c41c7a9 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -20,5 +20,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ) -include(../../../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 01e7299a33a..6ca798bb11c 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -106,6 +106,4 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", - "numpy==2.0.*", - "pyarrow==16.1.0.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 09c7ed2e217..96eb6c3bb30 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -32,9 +32,6 @@ endif() unset(cudf_FOUND) -# For wheels, this should always be true -set(USE_LIBARROW_FROM_PYARROW ON) - # Find Python early so that later commands can use it find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) @@ -46,13 +43,11 @@ set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) -include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) - add_subdirectory(../../cpp cudf-cpp) # Ensure other libraries needed by libcudf.so get installed alongside it. include(cmake/Modules/WheelHelpers.cmake) install_aliased_imported_targets( - TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp - DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} + TARGETS cudf nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp DESTINATION + ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} ) diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py index f6ba0d51bdb..ba134710868 100644 --- a/python/libcudf/libcudf/load.py +++ b/python/libcudf/libcudf/load.py @@ -18,10 +18,6 @@ def load_library(): - # This is loading the libarrow shared library in situations where it comes from the - # pyarrow package (i.e. when installed as a wheel). - import pyarrow # noqa: F401 - # Dynamically load libcudf.so. Prefer a system library if one is present to # avoid clobbering symbols that other packages might expect, but if no # other library is present use the one in the wheel. diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index fd01f7f6e2f..43878d0aec2 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -71,5 +71,4 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "librmm==24.10.*,>=0.0.0a0", "ninja", - "pyarrow==16.1.0.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index 340ad120377..a4b831790fb 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -36,7 +36,6 @@ include(rapids-cython-core) rapids_cython_init() -include(cmake/Modules/LinkPyarrowHeaders.cmake) add_subdirectory(pylibcudf) if(DEFINED cython_lib_dir) diff --git a/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake b/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake deleted file mode 100644 index d432f9fe1f5..00000000000 --- a/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake +++ /dev/null @@ -1,40 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -include_guard(GLOBAL) - -find_package(Python REQUIRED COMPONENTS Development NumPy) - -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" - OUTPUT_VARIABLE PYARROW_INCLUDE_DIR - ERROR_VARIABLE PYARROW_ERROR - RESULT_VARIABLE PYARROW_RESULT - OUTPUT_STRIP_TRAILING_WHITESPACE -) - -if(${PYARROW_RESULT}) - message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") -endif() - -# Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts of -# cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the -# requirement for arrow headers infects all of cudf. These requirements will go away once all -# scalar-related Cython code is removed from cudf. -function(link_to_pyarrow_headers targets) - foreach(target IN LISTS targets) - # PyArrow headers require numpy headers. 
-    target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
-    target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
-  endforeach()
-endfunction()
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index 55bea4fc262..bcc2151f5b6 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -20,8 +20,3 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
 )
-
-set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource
-    pylibcudf_io_json pylibcudf_io_parquet pylibcudf_io_types
-)
-link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
index 6831063ecb9..9f5f74506e9 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
@@ -21,6 +21,3 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_
 )
-
-set(targets_using_arrow_headers cpp_io_json cpp_io_types)
-link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index 0d673ea4cc3..e4c6edc6141 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
 test = [
     "fastavro>=0.22.9",
     "hypothesis",
-    "numpy",
+    "numpy>=1.23,<3.0a0",
     "pandas",
     "pytest-cov",
     "pytest-xdist",
@@ -104,8 +104,6 @@ requires = [
     "libcudf==24.10.*,>=0.0.0a0",
     "librmm==24.10.*,>=0.0.0a0",
     "ninja",
-    "numpy==2.0.*",
-    "pyarrow==16.1.0.*",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From d0e5cdfc4df197bfb4846a243e3d9ea9d7b87aab Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 12:17:51 -1000
Subject: [PATCH 130/270] Allow for binops between two differently sized
 DecimalDtypes (#16638)

Currently cudf Python has some custom logic for determining the resulting
dtype of a binop between 2 decimal dtypes, since the Python decimal dtypes
support `precision` and libcudf's do not. But libcudf does require that the
2 operands have the same decimal type when calculating the binop, so we must
ensure the inputs are cast to the same resulting dtype.
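As a quick illustration of the resulting behavior (a minimal sketch mirroring
the new `test_decimal_binop_upcast_operands` test in the diff below; the
expected dtype follows the precision/scale rules in `_get_decimal_type`):

```python
import cudf

# Operands with different decimal widths and precisions; previously this
# binop failed because libcudf requires both operands to share one decimal
# type.
ser1 = cudf.Series([0.51, 1.51, 2.51]).astype(cudf.Decimal64Dtype(18, 2))
ser2 = cudf.Series([0.90, 0.96, 0.99]).astype(cudf.Decimal128Dtype(19, 2))

# Both inputs are now upcast to the common output type before the libcudf
# call, so the addition succeeds and yields Decimal128Dtype(20, 2).
result = ser1 + ser2
print(result.dtype)
```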
Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16638
---
 python/cudf/cudf/core/column/decimal.py | 17 ++++++++++++++---
 python/cudf/cudf/tests/test_decimal.py  | 10 ++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 3b979ef2e97..8803ebd6791 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -135,9 +135,15 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
         # are computed outside of libcudf
         if op in {"__add__", "__sub__", "__mul__", "__div__"}:
             output_type = _get_decimal_type(lhs.dtype, rhs.dtype, op)
+            lhs = lhs.astype(
+                type(output_type)(lhs.dtype.precision, lhs.dtype.scale)
+            )
+            rhs = rhs.astype(
+                type(output_type)(rhs.dtype.precision, rhs.dtype.scale)
+            )
             result = libcudf.binaryop.binaryop(lhs, rhs, op, output_type)
-            # TODO: Why is this necessary? Why isn't the result's
-            # precision already set correctly based on output_type?
+            # libcudf doesn't support precision, so result.dtype doesn't
+            # maintain output_type.precision
             result.dtype.precision = output_type.precision
         elif op in {
             "__eq__",
@@ -430,7 +436,11 @@ def _with_type_metadata(
         return self
 
 
-def _get_decimal_type(lhs_dtype, rhs_dtype, op):
+def _get_decimal_type(
+    lhs_dtype: DecimalDtype,
+    rhs_dtype: DecimalDtype,
+    op: str,
+) -> DecimalDtype:
     """
     Returns the resulting decimal type after calculating
     precision & scale when performing the binary operation
@@ -441,6 +451,7 @@
 
     # This should at some point be hooked up to libcudf's
     # binary_operation_fixed_point_scale
+    # Note: libcudf decimal types don't have a concept of precision
     p1, p2 = lhs_dtype.precision, rhs_dtype.precision
     s1, s2 = lhs_dtype.scale, rhs_dtype.scale
 
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index b63788d20b7..048b3a656e3 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -398,3 +398,13 @@ def test_decimal_overflow():
     s = cudf.Series([1, 2], dtype=cudf.Decimal128Dtype(precision=38, scale=0))
     result = s * Decimal("1.0")
     assert_eq(cudf.Decimal128Dtype(precision=38, scale=1), result.dtype)
+
+
+def test_decimal_binop_upcast_operands():
+    ser1 = cudf.Series([0.51, 1.51, 2.51]).astype(cudf.Decimal64Dtype(18, 2))
+    ser2 = cudf.Series([0.90, 0.96, 0.99]).astype(cudf.Decimal128Dtype(19, 2))
+    result = ser1 + ser2
+    expected = cudf.Series([1.41, 2.47, 3.50]).astype(
+        cudf.Decimal128Dtype(20, 2)
+    )
+    assert_eq(result, expected)

From 88de8dd5bc0d2476a554107626d72ceb6d65cbab Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 13:26:27 -1000
Subject: [PATCH 131/270] Fix interval_range right child non-zero offset
 (#16651)

xref https://github.com/rapidsai/cudf/issues/16507

Similar to what is done in `IntervalIndex.from_breaks`, `interval_range`
generates the right edges by slicing a range of fencepost edges. However, we
don't want to maintain the new `offset` (`1`) on the right edge after
slicing, as it adversely impacts subsequent indexing operations.
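A short repro of the symptom (matching the `test_from_interval_range_indexing`
test added in the diff below); `repeat` exercises the gather-based indexing
path that reads the right child directly:

```python
import cudf
import pandas as pd

# Before this fix, the right child column produced by interval_range kept
# offset=1 from the fencepost slice, so gather-based operations such as
# repeat read the wrong right edges.
result = cudf.interval_range(start=0, end=1, name="a").repeat(2)
expected = pd.interval_range(start=0, end=1, name="a").repeat(2)
print(result)
print(expected)
```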
~~Additionally, I noticed that `Index(struct_data)` would automatically
convert it to an `IntervalIndex`, but `IntervalIndex` has a strict
requirement that the data have `left/right` keys, so I made this raise a
`NotImplementedError` instead~~

^ Will tackle this in a follow-up; it looks like there are cases where this
is valid

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16651
---
 python/cudf/cudf/core/index.py                  | 12 +++++++++++-
 python/cudf/cudf/tests/indexes/test_interval.py |  6 ++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 6a5e718c2c5..df8af856f4f 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3250,7 +3250,7 @@ def interval_range(
     freq=None,
     name=None,
     closed="right",
-) -> "IntervalIndex":
+) -> IntervalIndex:
     """
     Returns a fixed frequency IntervalIndex.
 
@@ -3347,6 +3347,16 @@ def interval_range(
     )
     left_col = bin_edges.slice(0, len(bin_edges) - 1)
     right_col = bin_edges.slice(1, len(bin_edges))
+    # For indexing, children should both have 0 offset
+    right_col = type(right_col)(
+        data=right_col.data,
+        dtype=right_col.dtype,
+        size=right_col.size,
+        mask=right_col.mask,
+        offset=0,
+        null_count=right_col.null_count,
+        children=right_col.children,
+    )
 
     if len(right_col) == 0 or len(left_col) == 0:
         dtype = IntervalDtype("int64", closed)
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
index a567c27f584..6653a94c9be 100644
--- a/python/cudf/cudf/tests/indexes/test_interval.py
+++ b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -407,3 +407,9 @@ def test_interval_range_name():
     expected = pd.interval_range(start=0, periods=5, freq=2, name="foo")
     result = cudf.interval_range(start=0, periods=5, freq=2, name="foo")
     assert_eq(result, expected)
+
+
+def test_from_interval_range_indexing():
+    result = cudf.interval_range(start=0, end=1, name="a").repeat(2)
+    expected = pd.interval_range(start=0, end=1, name="a").repeat(2)
+    assert_eq(result, expected)

From e2a15cb1ba856616b7de08e2f1a5c06d6d7c4a35 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 27 Aug 2024 19:47:31 -0400
Subject: [PATCH 132/270] Fix strings::detail::copy_range when target contains
 nulls (#16626)

Fixes the logic in `cudf::strings::detail::copy_range` for handling nulls in
the target range. The optimization check for nulls is removed, simplifying
the logic and making it more reliable as well. The benchmark showed no
significant change in performance. Also adds a specific gtest for this case.
Error was introduced in #15010 Closes #16618 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16626 --- cpp/src/strings/copying/copy_range.cu | 23 +++-------------------- cpp/tests/copying/copy_range_tests.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/cpp/src/strings/copying/copy_range.cu b/cpp/src/strings/copying/copy_range.cu index 9f8c47602f8..2434de1795e 100644 --- a/cpp/src/strings/copying/copy_range.cu +++ b/cpp/src/strings/copying/copy_range.cu @@ -40,20 +40,14 @@ struct compute_element_size { size_type source_begin; size_type target_begin; size_type target_end; - bool source_has_nulls; - bool target_has_nulls; __device__ cudf::size_type operator()(cudf::size_type idx) { if (idx >= target_begin && idx < target_end) { auto const str_idx = source_begin + (idx - target_begin); - return source_has_nulls && d_source.is_null_nocheck(str_idx) - ? 0 - : d_source.element(str_idx).size_bytes(); + return d_source.is_null(str_idx) ? 0 : d_source.element(str_idx).size_bytes(); } else { - return target_has_nulls && d_target.is_null_nocheck(idx) - ? 0 - : d_target.element(idx).size_bytes(); + return d_target.is_null(idx) ? 0 : d_target.element(idx).size_bytes(); } } }; @@ -97,20 +91,9 @@ std::unique_ptr copy_range(strings_column_view const& source, mr); }(); - auto [check_source, check_target] = [target, null_count = null_count] { - // check validities for both source & target - if (target.has_nulls()) { return std::make_pair(true, true); } - // check validities for source only - if (null_count > 0) { return std::make_pair(true, false); } - // no need to check validities - return std::make_pair(false, false); - }(); - // create offsets auto sizes_begin = cudf::detail::make_counting_transform_iterator( - 0, - compute_element_size{ - d_source, d_target, source_begin, target_begin, target_end, check_source, check_target}); + 0, compute_element_size{d_source, d_target, source_begin, target_begin, target_end}); auto [offsets_column, chars_bytes] = cudf::strings::detail::make_offsets_child_column( sizes_begin, sizes_begin + target.size(), stream, mr); auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 223946ddcee..25d93da277b 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -232,6 +232,16 @@ TEST_F(CopyRangeTestFixture, CopyWithNullsString) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*p_ret, expected); } +TEST_F(CopyRangeTestFixture, CopyWithTargetNullsString) +{ + auto target = + cudf::test::strings_column_wrapper({"a", "b", "", "d", "", "é"}, {1, 1, 0, 1, 1, 1}); + auto source = cudf::test::strings_column_wrapper({"A", "B", "C", "D", "E", "F"}); + auto result = cudf::copy_range(source, target, 1, 5, 1); + auto expected = cudf::test::strings_column_wrapper({"a", "B", "C", "D", "E", "é"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + TEST_F(CopyRangeTestFixture, CopyNoNullsString) { cudf::size_type size{100}; From d1412e00092d752e4e34371042d7dbfe972ba5d7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:52:57 -0400 Subject: [PATCH 133/270] Rework strings::slice benchmark to use nvbench (#16563) Moves google-benchmark for `cudf::strings::slice_strings` 
to nvbench. This is to help measure performance improvements in follow on work for strings-slice. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16563 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/string/slice.cpp | 89 ++++++++++++++++----------------- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 6db282a7728..7f3edfa0a01 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -325,7 +325,6 @@ ConfigureBench( string/filter.cpp string/repeat_strings.cpp string/replace.cpp - string/slice.cpp string/translate.cpp string/url_decode.cu ) @@ -346,6 +345,7 @@ ConfigureNVBench( string/like.cpp string/replace_re.cpp string/reverse.cpp + string/slice.cpp string/split.cpp string/split_re.cpp ) diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 0f973a7c8b5..1898f0340b6 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -14,11 +14,8 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include -#include -#include +#include #include @@ -29,56 +26,56 @@ #include +#include + #include -class StringSlice : public cudf::benchmark {}; +static void bench_slice(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const stype = state.get_string("type"); -enum slice_type { position, multi_position }; + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } -static void BM_slice(benchmark::State& state, slice_type rt) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); - auto starts_itr = thrust::constant_iterator(max_str_length / 3); - auto stops_itr = thrust::constant_iterator(max_str_length / 2); - cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows); - cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows); + auto starts_itr = thrust::constant_iterator(row_width / 4); + auto starts = + cudf::test::fixed_width_column_wrapper(starts_itr, starts_itr + num_rows); + auto stops_itr = thrust::constant_iterator(row_width / 3); + auto stops = + cudf::test::fixed_width_column_wrapper(stops_itr, stops_itr + num_rows); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (rt) { - case position: - cudf::strings::slice_strings(input, max_str_length / 3, max_str_length / 2); - break; - case multi_position: cudf::strings::slice_strings(input, starts, stops); break; - } + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + // gather some throughput statistics as well + auto chars_size 
= input.chars_size(stream);
+  state.add_element_count(chars_size, "chars_size");  // number of bytes
+  state.add_global_memory_reads(chars_size);          // all bytes are read
+  auto output_size = (row_width / 3 - row_width / 4) * num_rows;
+  state.add_global_memory_writes(output_size);
+
+  if (stype == "multi") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::slice_strings(input, starts, stops, stream);
+    });
+  } else {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::slice_strings(input, row_width / 4, row_width / 3, 1, stream);
+    });
   }
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
+  set_throughputs(state);
 }
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 2;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
-}
-
-#define STRINGS_BENCHMARK_DEFINE(name)                          \
-  BENCHMARK_DEFINE_F(StringSlice, name)                         \
-  (::benchmark::State & st) { BM_slice(st, slice_type::name); } \
-  BENCHMARK_REGISTER_F(StringSlice, name)                       \
-    ->Apply(generate_bench_args)                                \
-    ->UseManualTime()                                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(position)
-STRINGS_BENCHMARK_DEFINE(multi_position)
+NVBENCH_BENCH(bench_slice)
+  .set_name("slice")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
+  .add_int64_axis("num_rows", {262144, 2097152, 16777216})
+  .add_string_axis("type", {"position", "multi"});

From 60f30d831325d5816e6968e8037796b8ce1dc579 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 27 Aug 2024 17:45:33 -0700
Subject: [PATCH 134/270] Use `make_host_vector` instead of `make_std_vector`
 to facilitate pinned memory optimizations (#16386)

Replaced most of the `make_std_vector` calls with `make_host_vector` to allow
pinned memory and kernel copies when enabled. Skipped places where the change
would impact the public API.
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16386 --- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/src/io/csv/csv_gpu.cu | 4 +-- cpp/src/io/csv/csv_gpu.hpp | 2 +- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/json/json_column.cu | 42 +++++++++++----------- cpp/src/io/orc/writer_impl.cu | 8 ++--- cpp/src/io/orc/writer_impl.hpp | 5 +-- cpp/src/io/parquet/predicate_pushdown.cpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.cu | 20 +++++------ cpp/src/io/parquet/writer_impl.cu | 22 ++++++------ cpp/src/io/utilities/datasource.cpp | 4 +-- cpp/src/text/jaccard.cu | 2 +- 12 files changed, 57 insertions(+), 58 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 41f5494f78f..df6fe6e6ccb 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -609,7 +609,7 @@ void gather_bitmask(table_view const& source, stream); // Copy the valid counts into each column - auto const valid_counts = make_std_vector_sync(d_valid_counts, stream); + auto const valid_counts = make_host_vector_sync(d_valid_counts, stream); for (size_t i = 0; i < target.size(); ++i) { if (target[i]->nullable()) { auto const null_count = target_rows - valid_counts[i]; diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 7a05d0aebaf..5a0c6decfda 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -794,7 +794,7 @@ device_span __host__ remove_blank_rows(cudf::io::parse_options_view co return row_offsets.subspan(0, new_end - row_offsets.begin()); } -std::vector detect_column_types( +cudf::detail::host_vector detect_column_types( cudf::io::parse_options_view const& options, device_span const data, device_span const column_flags, @@ -812,7 +812,7 @@ std::vector detect_column_types( data_type_detection<<>>( options, data, column_flags, row_starts, d_stats); - return detail::make_std_vector_sync(d_stats, stream); + return detail::make_host_vector_sync(d_stats, stream); } void decode_row_column_data(cudf::io::parse_options_view const& options, diff --git a/cpp/src/io/csv/csv_gpu.hpp b/cpp/src/io/csv/csv_gpu.hpp index 06c60319371..aa3d9f6c7b7 100644 --- a/cpp/src/io/csv/csv_gpu.hpp +++ b/cpp/src/io/csv/csv_gpu.hpp @@ -199,7 +199,7 @@ device_span remove_blank_rows(cudf::io::parse_options_view const& opti * * @return stats Histogram of each dtypes' occurrence for each column */ -std::vector detect_column_types( +cudf::detail::host_vector detect_column_types( cudf::io::parse_options_view const& options, device_span data, device_span column_flags, diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 40d4372ae9d..e27b06682bb 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -614,7 +614,7 @@ std::vector decode_data(parse_options const& parse_opts, d_valid_counts, stream); - auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); + auto const h_valid_counts = cudf::detail::make_host_vector_sync(d_valid_counts, stream); for (int i = 0; i < num_active_columns; ++i) { out_buffers[i].null_count() = num_records - h_valid_counts[i]; } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index e5e21e054a6..8d6890045be 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -77,16 +77,16 @@ void print_tree(host_span input, tree_meta_t 
const& d_gpu_tree, rmm::cuda_stream_view stream) { - print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream), + print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.node_categories, stream), "node_categories", to_cat); - print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream), + print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.parent_node_ids, stream), "parent_node_ids", to_int); print_vec( - cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int); - auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream); - auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream); + cudf::detail::make_host_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int); + auto node_range_begin = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_begin, stream); + auto node_range_end = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_end, stream); print_vec(node_range_begin, "node_range_begin", to_int); print_vec(node_range_end, "node_range_end", to_int); for (int i = 0; i < int(node_range_begin.size()); i++) { @@ -373,9 +373,9 @@ std::vector copy_strings_to_host_sync( auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_std_vector_async( + auto const h_chars = cudf::detail::make_host_vector_async( cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_std_vector_async( + auto const h_offsets = cudf::detail::make_host_vector_async( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), stream); @@ -523,25 +523,23 @@ void make_device_json_column(device_span input, row_array_parent_col_id, stream); auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_std_vector_async(d_unique_col_ids, stream); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); auto column_categories = - cudf::detail::make_std_vector_async(d_column_tree.node_categories, stream); - auto column_parent_ids = - cudf::detail::make_std_vector_async(d_column_tree.parent_node_ids, stream); + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); auto column_range_beg = - cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream); - auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); std::vector column_names = copy_strings_to_host_sync( input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - stream.synchronize(); // array of arrays column names if (is_array_of_arrays) { TreeDepthT const row_array_children_level = is_enabled_lines ? 
1 : 2; auto values_column_indices = get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); auto h_values_column_indices = - cudf::detail::make_std_vector_async(values_column_indices, stream); - stream.synchronize(); + cudf::detail::make_host_vector_sync(values_column_indices, stream); std::transform(unique_col_ids.begin(), unique_col_ids.end(), column_names.begin(), @@ -611,11 +609,13 @@ void make_device_json_column(device_span input, return thrust::get<0>(a) < thrust::get<0>(b); }); - std::vector is_str_column_all_nulls{}; - if (is_enabled_mixed_types_as_string) { - is_str_column_all_nulls = cudf::detail::make_std_vector_sync( - is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream); - } + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_host_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return cudf::detail::make_empty_host_vector(0, stream); + }(); // use hash map because we may skip field name's col_ids std::unordered_map> columns; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 04eee68e757..ede9fd060b8 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1978,7 +1978,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, // Gather the row group sizes and copy to host auto d_tmp_rowgroup_sizes = rmm::device_uvector(segmentation.num_rowgroups(), stream); - std::map> rg_sizes; + std::map> rg_sizes; for (auto const& [col_idx, esizes] : elem_sizes) { // Copy last elem in each row group - equal to row group size thrust::tabulate(rmm::exec_policy(stream), @@ -1991,14 +1991,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, return src[rg_bounds[idx][col_idx].end - 1]; }); - rg_sizes[col_idx] = cudf::detail::make_std_vector_async(d_tmp_rowgroup_sizes, stream); + rg_sizes.emplace(col_idx, cudf::detail::make_host_vector_async(d_tmp_rowgroup_sizes, stream)); } return {std::move(elem_sizes), std::move(rg_sizes)}; } std::map decimal_column_sizes( - std::map> const& chunk_sizes) + std::map> const& chunk_sizes) { std::map column_sizes; std::transform(chunk_sizes.cbegin(), @@ -2056,7 +2056,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, orc_table.d_string_column_indices, stream); - auto const h_counts = cudf::detail::make_std_vector_sync(counts, stream); + auto const h_counts = cudf::detail::make_host_vector_sync(counts, stream); for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index f5f8b3cfed9..cae849ee315 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -90,8 +90,9 @@ struct stripe_rowgroups { */ struct encoder_decimal_info { std::map> - elem_sizes; ///< Column index -> per-element size map - std::map> rg_sizes; ///< Column index -> per-rowgroup size map + elem_sizes; ///< Column index -> per-element size map + std::map> + rg_sizes; ///< Column index -> per-rowgroup size map }; /** diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 5ca090b05b3..c8b8b7a1193 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -468,7 +468,7 @@ std::optional>> aggregate_reader_metadata::fi auto validity_it = 
cudf::detail::make_counting_transform_iterator( 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); - auto is_row_group_required = cudf::detail::make_std_vector_sync( + auto const is_row_group_required = cudf::detail::make_host_vector_sync( device_span(predicate.data(), predicate.size()), stream); // Return only filtered row groups based on predicate diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 54ba898b058..00d62c45962 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -77,9 +77,9 @@ void print_cumulative_page_info(device_span d_pages, device_span d_c_info, rmm::cuda_stream_view stream) { - std::vector pages = cudf::detail::make_std_vector_sync(d_pages, stream); - std::vector chunks = cudf::detail::make_std_vector_sync(d_chunks, stream); - std::vector c_info = cudf::detail::make_std_vector_sync(d_c_info, stream); + auto const pages = cudf::detail::make_host_vector_sync(d_pages, stream); + auto const chunks = cudf::detail::make_host_vector_sync(d_chunks, stream); + auto const c_info = cudf::detail::make_host_vector_sync(d_c_info, stream); printf("------------\nCumulative sizes by page\n"); @@ -647,7 +647,7 @@ std::tuple, size_t, size_t> compute_next_subpass( auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); // bring back to the cpu - auto const h_aggregated_info = cudf::detail::make_std_vector_sync(aggregated_info, stream); + auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream); // print_cumulative_row_info(h_aggregated_info, "adjusted"); // TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent @@ -694,8 +694,7 @@ std::vector compute_page_splits_by_row(device_span h_aggregated_info = - cudf::detail::make_std_vector_sync(aggregated_info, stream); + auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream); // print_cumulative_row_info(h_aggregated_info, "adjusted"); std::vector splits; @@ -1304,9 +1303,8 @@ void reader::impl::setup_next_pass(read_mode mode) printf("\tskip_rows: %'lu\n", pass.skip_rows); printf("\tnum_rows: %'lu\n", pass.num_rows); printf("\tbase mem usage: %'lu\n", pass.base_mem_size); - auto const num_columns = _input_columns.size(); - std::vector h_page_offsets = - cudf::detail::make_std_vector_sync(pass.page_offsets, _stream); + auto const num_columns = _input_columns.size(); + auto const h_page_offsets = cudf::detail::make_host_vector_sync(pass.page_offsets, _stream); for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { printf("\t\tColumn %'lu: num_pages(%'d)\n", c_idx, @@ -1426,7 +1424,7 @@ void reader::impl::setup_next_subpass(read_mode mode) subpass.pages = subpass.page_buf; } - std::vector h_spans = cudf::detail::make_std_vector_async(page_indices, _stream); + auto const h_spans = cudf::detail::make_host_vector_async(page_indices, _stream); subpass.pages.device_to_host_async(_stream); _stream.synchronize(); @@ -1464,7 +1462,7 @@ void reader::impl::setup_next_subpass(read_mode mode) printf("\t\tTotal expected usage: %'lu\n", total_expected_size == 0 ? 
subpass.decomp_page_data.size() + pass.base_mem_size : total_expected_size + pass.base_mem_size); - std::vector h_page_indices = cudf::detail::make_std_vector_sync(page_indices, _stream); + auto const h_page_indices = cudf::detail::make_host_vector_sync(page_indices, _stream); for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n", c_idx, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c2c5dbb4a56..74992aa733f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2230,20 +2230,20 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, bool need_sync{false}; // need to fetch the histogram data from the device - std::vector h_def_histogram; - std::vector h_rep_histogram; - if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { - if (def_histogram_bfr_size > 0) { - h_def_histogram = - std::move(cudf::detail::make_std_vector_async(def_level_histogram, stream)); + auto const h_def_histogram = [&]() { + if (stats_granularity == statistics_freq::STATISTICS_COLUMN && def_histogram_bfr_size > 0) { need_sync = true; + return cudf::detail::make_host_vector_async(def_level_histogram, stream); } - if (rep_histogram_bfr_size > 0) { - h_rep_histogram = - std::move(cudf::detail::make_std_vector_async(rep_level_histogram, stream)); + return cudf::detail::make_host_vector(0, stream); + }(); + auto const h_rep_histogram = [&]() { + if (stats_granularity == statistics_freq::STATISTICS_COLUMN && rep_histogram_bfr_size > 0) { need_sync = true; + return cudf::detail::make_host_vector_async(rep_level_histogram, stream); } - } + return cudf::detail::make_host_vector(0, stream); + }(); for (int r = 0; r < num_rowgroups; r++) { int p = rg_to_part[r]; @@ -2265,7 +2265,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, update_chunk_encoding_stats(column_chunk_meta, ck, write_v2_headers); if (ck.ck_stat_size != 0) { - std::vector const stats_blob = cudf::detail::make_std_vector_sync( + auto const stats_blob = cudf::detail::make_host_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 91be154e09d..e4313eba454 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -297,10 +297,10 @@ class device_buffer_source final : public datasource { { auto const count = std::min(size, this->size() - offset); auto const stream = cudf::get_default_stream(); - auto h_data = cudf::detail::make_std_vector_async( + auto h_data = cudf::detail::make_host_vector_async( cudf::device_span{_d_buffer.data() + offset, count}, stream); stream.synchronize(); - return std::make_unique>>(std::move(h_data)); + return std::make_unique>>(std::move(h_data)); } [[nodiscard]] bool supports_device_read() const override { return true; } diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index e465fb79c89..e856b89b836 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -376,7 +376,7 @@ std::pair, rmm::device_uvector> hash_subs sub_offsets.begin(), sub_offsets.end(), indices.begin()); - return cudf::detail::make_std_vector_sync(indices, stream); + return cudf::detail::make_host_vector_sync(indices, stream); }(); // Call segmented sort with the sort sections From 1a96e4cca188f4e0500a87c391ef105b49a42288 Mon Sep 17 00:00:00 
2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 18:57:48 -1000
Subject: [PATCH 135/270] Fix loc/iloc.__setitem__[:, loc] with non cupy types
 (#16677)

Discovered in https://github.com/rapidsai/cudf/pull/16652,
`DataFrame.iloc/loc.__setitem__` with a non-cupy type, e.g. `"category"`,
failed because the indexing path unconditionally tries to `cupy.asarray` the
value to be set, which only accepts types recognized by cupy. We can skip
this `asarray` if we have a numpy/pandas/cudf object.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16677
---
 python/cudf/cudf/core/dataframe.py      | 10 ++++++----
 python/cudf/cudf/tests/test_indexing.py | 10 ++++++++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 14b63c2b0d7..d54a800aedf 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -414,8 +414,9 @@ def _setitem_tuple_arg(self, key, value):
                 )
 
             else:
-                value = cupy.asarray(value)
-                if value.ndim == 2:
+                if not is_column_like(value):
+                    value = cupy.asarray(value)
+                if getattr(value, "ndim", 1) == 2:
                     # If the inner dimension is 1, it's broadcastable to
                     # all columns of the dataframe.
                     indexed_shape = columns_df.loc[key[0]].shape
@@ -558,8 +559,9 @@ def _setitem_tuple_arg(self, key, value):
             else:
                 # TODO: consolidate code path with identical counterpart
                 # in `_DataFrameLocIndexer._setitem_tuple_arg`
-                value = cupy.asarray(value)
-                if value.ndim == 2:
+                if not is_column_like(value):
+                    value = cupy.asarray(value)
+                if getattr(value, "ndim", 1) == 2:
                     indexed_shape = columns_df.iloc[key[0]].shape
                     if value.shape[1] == 1:
                         if value.shape[0] != indexed_shape[0]:
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 716b4dc6acd..9df2852dde8 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -2369,3 +2369,13 @@ def test_duplicate_labels_raises():
         df[["a", "a"]]
     with pytest.raises(ValueError):
         df.loc[:, ["a", "a"]]
+
+
+@pytest.mark.parametrize("indexer", ["iloc", "loc"])
+@pytest.mark.parametrize("dtype", ["category", "timedelta64[ns]"])
+def test_loc_iloc_setitem_col_slice_non_cupy_types(indexer, dtype):
+    df_pd = pd.DataFrame(range(2), dtype=dtype)
+    df_cudf = cudf.DataFrame.from_pandas(df_pd)
+    getattr(df_pd, indexer)[:, 0] = getattr(df_pd, indexer)[:, 0]
+    getattr(df_cudf, indexer)[:, 0] = getattr(df_cudf, indexer)[:, 0]
+    assert_eq(df_pd, df_cudf)

From 569939f40094b266a768a270d8966c5f7277c46a Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Wed, 28 Aug 2024 08:36:14 -0500
Subject: [PATCH 136/270] Fix slowdown in DataFrame repr in jupyter notebook
 (#16656)

Fixes: #15747

This PR fixes a slowdown in `DataFrame` repr inside a Jupyter notebook.
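The fix below has two parts: the proxy short-circuits IPython's
`_ipython_canary_method_should_not_exist_` capability probe with an immediate
`AttributeError`, and it registers an explicit `text/html` formatter so rich
display goes through a single dispatch. A minimal sketch, assuming IPython is
installed and using a hypothetical `MyFrame` class (`get_ipython`,
`display_formatter.formatters`, and `for_type` are the real IPython APIs used
in the diff):

```python
from IPython import get_ipython


class MyFrame:
    """Stand-in for the proxied DataFrame type (hypothetical)."""

    def _repr_html_(self):
        # In cudf.pandas this call is routed through the fast/slow
        # dispatch machinery instead of a plain attribute lookup.
        return "<table><tr><td>fast path</td></tr></table>"


shell = get_ipython()
if shell is not None:
    # Register an explicit text/html formatter so notebook display calls
    # _repr_html_ directly rather than probing optional attributes through
    # the expensive fallback path on every display.
    html_formatter = shell.display_formatter.formatters["text/html"]
    html_formatter.for_type(MyFrame, lambda obj: obj._repr_html_())
```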
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/16656 --- .gitignore | 4 ++ ci/cudf_pandas_scripts/run_tests.sh | 3 + .../all_cuda-118_arch-x86_64.yaml | 4 ++ .../all_cuda-125_arch-x86_64.yaml | 4 ++ dependencies.yaml | 8 ++- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 28 ++++++++ .../data/repr_slow_down_test.ipynb | 69 +++++++++++++++++++ .../cudf_pandas_tests/test_cudf_pandas.py | 36 ++++++++++ python/cudf/pyproject.toml | 4 ++ 10 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb diff --git a/.gitignore b/.gitignore index 153c7f59744..619e1464b2a 100644 --- a/.gitignore +++ b/.gitignore @@ -178,3 +178,7 @@ jupyter_execute # clang tooling compile_commands.json .clangd/ + +# pytest artifacts +rmm_log.txt +python/cudf/cudf_pandas_tests/data/rmm_log.txt diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 39056d58d56..52964496b36 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -61,6 +61,9 @@ else "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" fi +python -m pip install ipykernel +python -m ipykernel install --user --name python3 + python -m pytest -p cudf.pandas \ --cov-config=./python/cudf/.coveragerc \ --cov=cudf \ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 96596958636..c4c32da8af2 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -37,6 +37,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython +- jupyter_client - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 @@ -48,6 +49,8 @@ dependencies: - moto>=4.0.8 - msgpack-python - myst-nb +- nbconvert +- nbformat - nbsphinx - ninja - notebook @@ -57,6 +60,7 @@ dependencies: - nvcc_linux-64=11.8 - nvcomp==3.0.6 - nvtx>=0.2.1 +- openpyxl - packaging - pandas - pandas>=2.0,<2.2.3dev0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index efc5f76b90f..7439c9543a5 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -38,6 +38,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython +- jupyter_client - libcufile-dev - libcurand-dev - libkvikio==24.10.*,>=0.0.0a0 @@ -47,6 +48,8 @@ dependencies: - moto>=4.0.8 - msgpack-python - myst-nb +- nbconvert +- nbformat - nbsphinx - ninja - notebook @@ -55,6 +58,7 @@ dependencies: - numpydoc - nvcomp==3.0.6 - nvtx>=0.2.1 +- openpyxl - packaging - pandas - pandas>=2.0,<2.2.3dev0 diff --git a/dependencies.yaml b/dependencies.yaml index b55860815bf..5be291b3671 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -31,6 +31,7 @@ files: - test_python_cudf - test_python_dask_cudf - test_python_pylibcudf + - test_python_cudf_pandas test_static_build: output: none includes: @@ -49,6 +50,7 @@ files: - test_python_common - test_python_cudf - test_python_dask_cudf + - test_python_cudf_pandas test_java: output: none includes: @@ -934,9 +936,13 @@ dependencies: # installation issues with `psycopg2`. 
- pandas[test, pyarrow, performance, computation, fss, excel, parquet, feather, hdf5, spss, html, xml, plot, output-formatting, clipboard, compression] - pytest-reportlog + - ipython test_python_cudf_pandas: common: - - output_types: [requirements, pyproject] + - output_types: [conda, requirements, pyproject] packages: - ipython + - jupyter_client + - nbconvert + - nbformat - openpyxl diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d54a800aedf..a309b9117eb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -680,7 +680,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): 3 3 0.3 """ - _PROTECTED_KEYS = frozenset(("_data", "_index")) + _PROTECTED_KEYS = frozenset( + ("_data", "_index", "_ipython_canary_method_should_not_exist_") + ) _accessors: set[Any] = set() _loc_indexer_type = _DataFrameLocIndexer _iloc_indexer_type = _DataFrameIlocIndexer diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 478108f36f1..6d03063fa27 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -61,6 +61,12 @@ TimeGrouper as pd_TimeGrouper, ) +try: + from IPython import get_ipython + + ipython_shell = get_ipython() +except ImportError: + ipython_shell = None cudf.set_option("mode.pandas_compatible", True) @@ -208,6 +214,12 @@ def _DataFrame__dir__(self): ] +def ignore_ipython_canary_check(self, **kwargs): + raise AttributeError( + "_ipython_canary_method_should_not_exist_ doesn't exist" + ) + + DataFrame = make_final_proxy_type( "DataFrame", cudf.DataFrame, @@ -220,10 +232,26 @@ def _DataFrame__dir__(self): "_constructor": _FastSlowAttribute("_constructor"), "_constructor_sliced": _FastSlowAttribute("_constructor_sliced"), "_accessors": set(), + "_ipython_canary_method_should_not_exist_": ignore_ipython_canary_check, }, ) +def custom_repr_html(obj): + # This custom method is need to register a html format + # for ipython + return _fast_slow_function_call( + lambda obj: obj._repr_html_(), + obj, + )[0] + + +if ipython_shell: + # See: https://ipython.readthedocs.io/en/stable/config/integrating.html#formatters-for-third-party-types + html_formatter = ipython_shell.display_formatter.formatters["text/html"] + html_formatter.for_type(DataFrame, custom_repr_html) + + Series = make_final_proxy_type( "Series", cudf.Series, diff --git a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb new file mode 100644 index 00000000000..c7d39b78810 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb @@ -0,0 +1,69 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext cudf.pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "np.random.seed(0)\n", + "\n", + "num_rows = 25_000_000\n", + "num_columns = 12\n", + "\n", + "# Create a DataFrame with random data\n", + "df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, num_columns)),\n", + " columns=[f'Column_{i}' for i in range(1, num_columns + 1)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 028f5f173ac..0827602852d 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -14,9 +14,12 @@ import types from io import BytesIO, StringIO +import jupyter_client +import nbformat import numpy as np import pyarrow as pa import pytest +from nbconvert.preprocessors import ExecutePreprocessor from numba import NumbaDeprecationWarning from pytz import utc @@ -1650,3 +1653,36 @@ def test_change_index_name(index): assert s.index.name == name assert df.index.name == name + + +def test_notebook_slow_repr(): + notebook_filename = ( + os.path.dirname(os.path.abspath(__file__)) + + "/data/repr_slow_down_test.ipynb" + ) + with open(notebook_filename, "r", encoding="utf-8") as f: + nb = nbformat.read(f, as_version=4) + + ep = ExecutePreprocessor( + timeout=20, kernel_name=jupyter_client.KernelManager().kernel_name + ) + + try: + ep.preprocess(nb, {"metadata": {"path": "./"}}) + except Exception as e: + assert False, f"Error executing the notebook: {e}" + + # Collect the outputs + html_result = nb.cells[2]["outputs"][0]["data"]["text/html"] + for string in { + "div", + "Column_1", + "Column_2", + "Column_3", + "Column_4", + "tbody", + "
", + }: + assert ( + string in html_result + ), f"Expected string {string} not found in the output" diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 8386935fab0..0c1d5015078 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -63,11 +63,15 @@ test = [ "tzdata", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. pandas-tests = [ + "ipython", "pandas[test, pyarrow, performance, computation, fss, excel, parquet, feather, hdf5, spss, html, xml, plot, output-formatting, clipboard, compression]", "pytest-reportlog", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. cudf-pandas-tests = [ "ipython", + "jupyter_client", + "nbconvert", + "nbformat", "openpyxl", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 5491b394921ca3e03f09c9e789f1ba00da9db0b1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 28 Aug 2024 11:03:10 -0500 Subject: [PATCH 137/270] switch from typing.Callable to collections.abc.Callable (#16670) Follow-up to #16637. Once this project's minimum support Python version was bumped up to Python 3.10, `ruff` started raising this error from `pyupgrade`: ```text Import from `collections.abc` instead: `Callable` ``` * ruff docs: https://docs.astral.sh/ruff/rules/deprecated-import/ * `typing` docs saying that `typing.Callable` is deprecated starting in Python 3.9 https://docs.python.org/3/library/typing.html#typing.Callable * context: https://github.com/rapidsai/cudf/pull/16637#discussion_r1727482177 This proposes accepting that suggestion, so that `cudf` won't be broken whenever `Callable` is removed from the `typing` module. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/16670 --- pyproject.toml | 4 +++- python/cudf/cudf/_typing.py | 3 ++- python/cudf/cudf/core/column/numerical.py | 4 +++- python/cudf/cudf/core/column_accessor.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/dtypes.py | 4 +++- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/core/udf/utils.py | 5 ++++- python/cudf/cudf/io/parquet.py | 6 +++++- python/cudf/cudf/options.py | 4 ++-- python/cudf/cudf/pandas/fast_slow_proxy.py | 4 ++-- python/cudf/cudf/utils/ioutils.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 4 ++-- python/cudf_polars/cudf_polars/typing/__init__.py | 3 ++- python/cudf_polars/pyproject.toml | 1 - python/dask_cudf/dask_cudf/io/json.py | 2 +- 16 files changed, 36 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e15cb7b3cdd..8f9aa165e5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,9 @@ select = [ # non-pep585-annotation "UP006", # non-pep604-annotation - "UP007" + "UP007", + # Import from `collections.abc` instead: `Callable` + "UP035", ] ignore = [ # whitespace before : diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 34c96cc8cb3..6e8ad556b08 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,7 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import sys -from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, Union +from collections.abc import Callable +from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 90bec049831..7f391c8a79c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Callable, Sequence, cast +from typing import TYPE_CHECKING, Any, Sequence, cast import numpy as np import pandas as pd @@ -28,6 +28,8 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: + from collections.abc import Callable + from cudf._typing import ( ColumnBinaryOperand, ColumnLike, diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 34076fa0060..09b0f453692 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -6,7 +6,7 @@ import sys from collections import abc from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Callable, Mapping, cast +from typing import TYPE_CHECKING, Any, Mapping, cast import numpy as np import pandas as pd @@ -639,7 +639,7 @@ def _pad_key( def rename_levels( self, - mapper: Mapping[abc.Hashable, abc.Hashable] | Callable, + mapper: Mapping[abc.Hashable, abc.Hashable] | abc.Callable, level: int | None = None, ) -> Self: """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a309b9117eb..6065e0e1eeb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,8 +13,8 @@ import textwrap import warnings from collections import abc, defaultdict -from collections.abc import Iterator -from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator +from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast import cupy import numba diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6d532e01cba..2110e610c37 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -7,7 +7,7 @@ import textwrap import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Any import numpy as np import pandas as pd @@ -27,6 +27,8 @@ PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.PandasDtype if TYPE_CHECKING: + from collections.abc import Callable + from cudf._typing import Dtype from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3e1efd7c97a..cbe1e97d834 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, MutableMapping # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. 
@@ -403,7 +403,7 @@ def __arrow_array__(self, type=None): @_performance_tracking def _to_array( self, - get_array: Callable, + get_array: abc.Callable, module: ModuleType, copy: bool, dtype: Dtype | None = None, diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index d616761cb3b..6d7362952c9 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -3,7 +3,7 @@ import functools import os -from typing import Any, Callable +from typing import TYPE_CHECKING, Any import cachetools import cupy as cp @@ -41,6 +41,9 @@ from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import initfunc +if TYPE_CHECKING: + from collections.abc import Callable + # Maximum size of a string column is 2 GiB _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get("STRINGS_UDF_HEAP_SIZE", 2**31) _heap_size = 0 diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 6b895abbf66..d6b2ae2f31c 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -10,7 +10,7 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import Callable +from typing import TYPE_CHECKING from uuid import uuid4 import numpy as np @@ -24,6 +24,10 @@ from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +if TYPE_CHECKING: + from collections.abc import Callable + + BYTE_SIZES = { "kb": 1000, "mb": 1000000, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 94e73021cec..df7bbe22a61 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -5,10 +5,10 @@ import textwrap from contextlib import ContextDecorator from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: - from collections.abc import Container + from collections.abc import Callable, Container @dataclass diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index bb678fd1efe..4b0fd9a5b36 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,9 +10,9 @@ import pickle import types import warnings -from collections.abc import Iterator +from collections.abc import Callable, Iterator from enum import IntEnum -from typing import Any, Callable, Literal, Mapping +from typing import Any, Literal, Mapping import numpy as np diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index e5944d7093c..94974e595b1 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -4,9 +4,9 @@ import os import urllib import warnings +from collections.abc import Callable from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread -from typing import Callable import fsspec import fsspec.implementations.local diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index ebc7dee6bfb..e334e6f5cc5 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -18,7 +18,7 @@ import types from functools import cache from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa import pylibcudf as plc @@ -31,7 +31,7 @@ from cudf_polars.utils import sorting if TYPE_CHECKING: - from 
collections.abc import MutableMapping + from collections.abc import Callable, MutableMapping from typing import Literal from cudf_polars.typing import Schema diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 5276073e62a..adab10bdded 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -13,7 +13,8 @@ from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir if TYPE_CHECKING: - from typing import Callable, TypeAlias + from collections.abc import Callable + from typing import TypeAlias import polars as pl diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 0382e3ce6a2..f2bab9e6623 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -115,7 +115,6 @@ ignore = [ # tryceratops "TRY003", # Avoid specifying long messages outside the exception class # pyupgrade - "UP035", # Import from `collections.abc` instead: `Callable` "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` # Lints below are turned off because of conflicts with the ruff # formatter diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 8705d98e9d6..98c5ceedb76 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -81,7 +81,7 @@ def read_json( If str, this value will be used as the ``engine`` argument when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~typing.Callable`, this value will be used as the + If a :obj:`~collections.abc.Callable`, this value will be used as the underlying function used to create each partition from JSON data. The default value is "auto", so that ``engine=partial(cudf.read_json, engine="auto")`` will be From c600a65e4fd82a4a6eb00feaee032b62872de761 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 28 Aug 2024 09:55:11 -0700 Subject: [PATCH 138/270] Update documentation for Dask cuDF (#16671) General documentation update for Dask cuDF: - Adds `README.md` file to `dask_cudf` (this is currently a symlink to cudf's README, which isn't terribly helpful) - Emphasizes direct usage of the `dask.dataframe` API (rather than the explicit `dask_cudf` API) - Including the `to_backend` API - Advertises query-planning support - Includes a simple Dask CUDA example (and best-practices link) Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16671 --- docs/cudf/source/user_guide/10min.ipynb | 6 +- python/dask_cudf/README.md | 136 +++++++++++++++++++++++- 2 files changed, 140 insertions(+), 2 deletions(-) mode change 120000 => 100644 python/dask_cudf/README.md diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index c3da2558db8..2eaa75b3189 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -15,7 +15,11 @@ "\n", "[Dask](https://dask.org/) is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. 
On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions.\n", "\n", - "[Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", + "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", + "\n", + "\n", + "> [!NOTE] \n", + "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n", "\n", "\n", "## When to use cuDF and Dask-cuDF\n", diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md deleted file mode 120000 index fe840054137..00000000000 --- a/python/dask_cudf/README.md +++ /dev/null @@ -1 +0,0 @@ -../../README.md \ No newline at end of file diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md new file mode 100644 index 00000000000..6edb9f87d48 --- /dev/null +++ b/python/dask_cudf/README.md @@ -0,0 +1,135 @@ +#
 Dask cuDF - A GPU Backend for Dask DataFrame
+ +Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. + +## Using Dask cuDF + +### The Dask DataFrame API (Recommended) + +Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically: + +```python +import dask +dask.config.set({"dataframe.backend": "cudf"}) + +import dask.dataframe as dd +# This gives us a cuDF-backed dataframe +df = dd.read_parquet("data.parquet", ...) +``` + +> [!IMPORTANT] +> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function: + +```python +import pandas as pd +import cudf + +# This gives us a Pandas-backed dataframe +dd.from_pandas(pd.DataFrame({"a": range(10)})) + +# This gives us a cuDF-backed dataframe +dd.from_pandas(cudf.DataFrame({"a": range(10)})) +``` + +A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend: + +```python +df = df.to_backend("pandas") +``` + +Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend: + +```python +df = df.to_backend("cudf") +``` + +### The Explicit Dask cuDF API + +In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API: + +```python +import dask_cudf + +# This always gives us a cuDF-backed dataframe +df = dask_cudf.read_parquet("data.parquet", ...) +``` + +> [!NOTE] +> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, some parts of the explicit API are incompatible with automatic query planning (see the next section). + +See the [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information. + +## Query Planning + +Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood. + +For example, the following user code will automatically benefit from predicate pushdown when the result is computed. + +```python +df = dd.read_parquet("/my/parquet/dataset/") +result = df.sort_values('B')['A'] +``` + +Unoptimized expression graph (`df.pprint()`): +``` +Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' ... +``` + +Simplified expression graph (`df.simplify().pprint()`): +``` +Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... +``` + +> [!NOTE] +> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). 
The user does not need to call `simplify` themselves. + + +## Using Multiple GPUs and Multiple Nodes + +Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the compute tasks needed for a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. + +> [!IMPORTANT] +> Neither Dask cuDF nor Dask DataFrame provides support for multi-GPU or multi-node execution on its own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs. + +In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy a distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example: + +```python + +from dask_cuda import LocalCUDACluster +from distributed import Client + +client = Client( + LocalCUDACluster( + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling + ) +) + +df = dd.read_parquet("/my/parquet/dataset/") +agg = df.groupby('B').sum() +agg.compute() # This will use the cluster defined above +``` + +> [!NOTE] +> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details. + +Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)). + +## Install + +See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. + +## Resources + +- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/) +- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) +- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) +- [Deployment](https://docs.rapids.ai/deployment/stable/) +- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. From 925530afe8178b7e788ea1a8d4df4c0eb4d042dc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 28 Aug 2024 12:08:38 -0500 Subject: [PATCH 139/270] Fix slowdown in `CategoricalIndex.__repr__` (#16665) Fixes: #13297 This PR fixes a slowdown when generating the repr of a `CategoricalIndex` with a very large number of unique values. There was no way to fix this cleanly using only public APIs, because all of the public APIs appear to perform category validation even when `fastpath=True`. 
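For illustration, a minimal sketch of the workload this speeds up (assuming a cudf build that includes this fix; the index size here is a smaller stand-in for the 100-million-element index used in the new test):

```python
# Hypothetical timing check for the repr of a large CategoricalIndex.
import cudf

gi = cudf.CategoricalIndex(range(1_000_000))  # many unique categories
print(repr(gi))  # previously very slow: category validation ran on the full categories array
```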
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16665 --- python/cudf/cudf/core/index.py | 16 +++++++++++++++- python/cudf/cudf/testing/_utils.py | 21 +++++++++++++++++++++ python/cudf/cudf/tests/test_repr.py | 11 +++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index df8af856f4f..27c6556f976 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1443,7 +1443,21 @@ def __repr__(self): output[:break_idx].replace("'", "") + output[break_idx:] ) else: - output = repr(preprocess.to_pandas()) + # Too many unique categories will cause + # the output to take too long. In this case, we + # split the categories into data and categories + # and generate the repr separately and + # merge them. + pd_cats = pd.Categorical( + preprocess.astype(preprocess.categories.dtype).to_pandas() + ) + pd_preprocess = pd.CategoricalIndex(pd_cats) + data_repr = repr(pd_preprocess).split("\n") + pd_preprocess.dtype._categories = ( + preprocess.categories.to_pandas() + ) + cats_repr = repr(pd_preprocess).split("\n") + output = "\n".join(data_repr[:-1] + cats_repr[-1:]) output = output.replace("nan", str(cudf.NA)) elif preprocess._values.nullable: diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index a6a2d4eea00..540f12c8382 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. import itertools +import signal import string from collections import abc from contextlib import contextmanager @@ -368,3 +369,23 @@ def sv_to_udf_str_testing_lowering(context, builder, sig, args): return cast_string_view_to_udf_string( context, builder, sig.args[0], sig.return_type, args[0] ) + + +class cudf_timeout: + """ + Context manager to raise a TimeoutError after a specified number of seconds. 
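+
+    Example (illustrative usage only; relies on SIGALRM, so it works in the
+    main thread on Unix; `obj` below is a hypothetical stand-in):
+
+    >>> with cudf_timeout(2, timeout_message="Took too long"):
+    ...     repr(obj)  # raises TimeoutError if this takes over 2 seconds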
+    """ + + def __init__(self, seconds, *, timeout_message=""): + self.seconds = int(seconds) + self.timeout_message = timeout_message + + def _timeout_handler(self, signum, frame): + raise TimeoutError(self.timeout_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self._timeout_handler) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index a013745f71e..57eef9e3463 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1480,3 +1480,14 @@ def test_interval_index_repr(): gi = cudf.from_pandas(pi) assert repr(pi) == repr(gi) + + +def test_large_unique_categories_repr(): + # Unfortunately, this is a long-running test (takes about 1 minute) + # and there is no way we can reduce the time + pi = pd.CategoricalIndex(range(100_000_000)) + gi = cudf.CategoricalIndex(range(100_000_000)) + expected_repr = repr(pi) + with utils.cudf_timeout(2, timeout_message="Failed to repr fast enough"): + actual_repr = repr(gi) + assert expected_repr == actual_repr From dba6c1fe37bbc4a3b15123bfd3a5c1d5cf693fe3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 28 Aug 2024 09:29:38 -1000 Subject: [PATCH 140/270] Remove build_categorical_column in favor of CategoricalColumn constructor (#16617) `build_categorical_column` was largely redundant with the CategoricalColumn constructor, so in the spirit of having One Way to Do Things, this change replaces the former with the latter. The remaining usage of `build_categorical_column` in cugraph has been replaced in https://github.com/rapidsai/cugraph/pull/4618 Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16617 --- python/cudf/cudf/core/_internals/where.py | 13 - python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/categorical.py | 266 +++++++++---------- python/cudf/cudf/core/column/column.py | 96 +++---- python/cudf/cudf/core/column/numerical.py | 20 +- python/cudf/cudf/core/cut.py | 17 +- python/cudf/cudf/core/dataframe.py | 33 +-- python/cudf/cudf/core/df_protocol.py | 28 +- python/cudf/cudf/core/frame.py | 25 +- python/cudf/cudf/core/index.py | 18 +- python/cudf/cudf/core/indexed_frame.py | 12 +- python/cudf/cudf/core/series.py | 21 +- python/cudf/cudf/core/single_column_frame.py | 7 +- python/cudf/cudf/io/parquet.py | 16 +- python/dask_cudf/dask_cudf/backends.py | 19 +- python/dask_cudf/dask_cudf/io/parquet.py | 12 +- 16 files changed, 284 insertions(+), 320 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 0c754317185..2199d4d5ba5 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -106,19 +106,6 @@ def _check_and_cast_columns_with_other( return _normalize_categorical(source_col.astype(common_dtype), other) -def _make_categorical_like(result, column): - if isinstance(column, cudf.core.column.CategoricalColumn): - result = cudf.core.column.build_categorical_column( - categories=column.categories, - codes=result, - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=column.ordered, - ) - return result - -
def _can_cast(from_dtype, to_dtype): """ Utility function to determine if we can cast diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index e7119fcdf47..5781d77ee9a 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -8,7 +8,6 @@ from cudf.core.column.column import ( ColumnBase, as_column, - build_categorical_column, build_column, column_empty, column_empty_like, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a7e98e5218f..de5ed15771d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -52,6 +52,15 @@ _DEFAULT_CATEGORICAL_VALUE = np.int8(-1) +def as_unsigned_codes( + num_cats: int, codes: NumericalColumn +) -> NumericalColumn: + codes_dtype = min_unsigned_type(num_cats) + return cast( + cudf.core.column.numerical.NumericalColumn, codes.astype(codes_dtype) + ) + + class CategoricalAccessor(ColumnMethods): """ Accessor object for categorical properties of the Series values. @@ -637,13 +646,12 @@ def __setitem__(self, key, value): value = value.codes codes = self.codes codes[key] = value - out = cudf.core.column.build_categorical_column( - categories=self.categories, - codes=codes, - mask=codes.base_mask, + out = type(self)( + data=self.data, size=codes.size, - offset=self.offset, - ordered=self.ordered, + dtype=self.dtype, + mask=codes.base_mask, + children=(codes,), ) self._mimic_inplace(out, inplace=True) @@ -669,16 +677,13 @@ def _fill( def slice(self, start: int, stop: int, stride: int | None = None) -> Self: codes = self.codes.slice(start, stop, stride) - return cast( - Self, - cudf.core.column.build_categorical_column( - categories=self.categories, - codes=codes, - mask=codes.base_mask, - ordered=self.ordered, - size=codes.size, - offset=codes.offset, - ), + return type(self)( + data=self.data, # type: ignore[arg-type] + size=codes.size, + dtype=self.dtype, + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) def _reduce( @@ -719,7 +724,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: ) return self.codes._binaryop(other.codes, op) - def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: + def normalize_binop_value(self, other: ScalarLike) -> Self: if isinstance(other, column.ColumnBase): if not isinstance(other, CategoricalColumn): return NotImplemented @@ -727,30 +732,27 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: raise TypeError( "Categoricals can only compare with the same type" ) - return other - - ary = column.as_column( + return cast(Self, other) + codes = column.as_column( self._encode(other), length=len(self), dtype=self.codes.dtype ) - return column.build_categorical_column( - categories=self.dtype.categories._values, - codes=column.as_column(ary), + return type(self)( + data=None, + size=self.size, + dtype=self.dtype, mask=self.base_mask, - ordered=self.dtype.ordered, + children=(codes,), # type: ignore[arg-type] ) - def sort_values( - self, ascending: bool = True, na_position="last" - ) -> CategoricalColumn: + def sort_values(self, ascending: bool = True, na_position="last") -> Self: codes = self.codes.sort_values(ascending, na_position) - col = column.build_categorical_column( - categories=self.dtype.categories._values, - codes=codes, - mask=codes.base_mask, + return type(self)( + data=self.data, # type: ignore[arg-type] size=codes.size, - 
ordered=self.dtype.ordered, + dtype=self.dtype, + mask=codes.base_mask, + children=(codes,), ) - return col def element_indexing(self, index: int) -> ScalarLike: val = self.codes.element_indexing(index) @@ -777,12 +779,12 @@ def to_pandas( if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) - col = column.build_categorical_column( - categories=self.categories, - codes=column.as_column(self.codes, dtype=self.codes.dtype), + col = type(self)( + data=self.data, # type: ignore[arg-type] + size=self.size, + dtype=self.dtype, mask=new_mask, - ordered=self.dtype.ordered, - size=self.codes.size, + children=self.children, ) else: col = self @@ -849,15 +851,15 @@ def data_array_view( ) -> numba.cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) - def unique(self) -> CategoricalColumn: + def unique(self) -> Self: codes = self.codes.unique() - return column.build_categorical_column( - categories=self.categories, - codes=codes, + return type(self)( + data=self.data, # type: ignore[arg-type] + size=codes.size, + dtype=self.dtype, mask=codes.base_mask, offset=codes.offset, - size=codes.size, - ordered=self.ordered, + children=(codes,), ) def _encode(self, value) -> ScalarLike: @@ -988,14 +990,17 @@ def find_and_replace( output = libcudf.replace.replace( replaced_codes, to_replace_col, replacement_col ) + codes = as_unsigned_codes(len(new_cats["cats"]), output) - result = column.build_categorical_column( - categories=new_cats["cats"], - codes=output, - mask=output.base_mask, - offset=output.offset, - size=output.size, - ordered=self.dtype.ordered, + result = type(self)( + data=self.data, # type: ignore[arg-type] + size=codes.size, + dtype=CategoricalDtype( + categories=new_cats["cats"], ordered=self.dtype.ordered + ), + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) if result.dtype != self.dtype: warnings.warn( @@ -1082,7 +1087,7 @@ def is_monotonic_increasing(self) -> bool: def is_monotonic_decreasing(self) -> bool: return bool(self.ordered) and self.codes.is_monotonic_decreasing - def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: + def as_categorical_column(self, dtype: Dtype) -> Self: if isinstance(dtype, str) and dtype == "category": return self if isinstance(dtype, pd.CategoricalDtype): @@ -1099,7 +1104,23 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if not isinstance(self.categories, type(dtype.categories._column)): # If both categories are of different Column types, # return a column full of Nulls. 
- return _create_empty_categorical_column(self, dtype) + codes = cast( + cudf.core.column.numerical.NumericalColumn, + column.as_column( + _DEFAULT_CATEGORICAL_VALUE, + length=self.size, + dtype=self.codes.dtype, + ), + ) + codes = as_unsigned_codes(len(dtype.categories), codes) + return type(self)( + data=self.data, # type: ignore[arg-type] + size=self.size, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + children=(codes,), + ) return self.set_categories( new_categories=dtype.categories, ordered=bool(dtype.ordered) @@ -1185,26 +1206,29 @@ def _concat( codes = [o for o in codes if len(o)] codes_col = libcudf.concat.concat_columns(objs) - return column.build_categorical_column( - categories=column.as_column(cats), - codes=codes_col, - mask=codes_col.base_mask, + codes_col = as_unsigned_codes( + len(cats), + cast(cudf.core.column.numerical.NumericalColumn, codes_col), + ) + return CategoricalColumn( + data=None, size=codes_col.size, + dtype=CategoricalDtype(categories=cats), + mask=codes_col.base_mask, offset=codes_col.offset, + children=(codes_col,), # type: ignore[arg-type] ) - def _with_type_metadata( - self: CategoricalColumn, dtype: Dtype - ) -> CategoricalColumn: + def _with_type_metadata(self: Self, dtype: Dtype) -> Self: if isinstance(dtype, CategoricalDtype): - return column.build_categorical_column( - categories=dtype.categories._values, - codes=self.codes, - mask=self.codes.base_mask, - ordered=dtype.ordered, + return type(self)( + data=self.data, # type: ignore[arg-type] size=self.codes.size, + dtype=dtype, + mask=self.codes.base_mask, offset=self.codes.offset, null_count=self.codes.null_count, + children=(self.codes,), ) return self @@ -1213,7 +1237,7 @@ def set_categories( new_categories: Any, ordered: bool = False, rename: bool = False, - ) -> CategoricalColumn: + ) -> Self: # See CategoricalAccessor.set_categories. ordered = ordered if ordered is not None else self.ordered @@ -1232,25 +1256,39 @@ def set_categories( "new_categories must have the same " "number of items as old categories" ) - - out_col = column.build_categorical_column( - categories=new_categories, - codes=self.base_children[0], - mask=self.base_mask, + out_col = type(self)( + data=self.data, # type: ignore[arg-type] size=self.size, + dtype=CategoricalDtype( + categories=new_categories, ordered=ordered + ), + mask=self.base_mask, offset=self.offset, - ordered=ordered, + children=(self.codes,), ) else: out_col = self if type(out_col.categories) is not type(new_categories): # If both categories are of different Column types, # return a column full of Nulls. 
- out_col = _create_empty_categorical_column( - self, - CategoricalDtype( + new_codes = cast( + cudf.core.column.numerical.NumericalColumn, + column.as_column( + _DEFAULT_CATEGORICAL_VALUE, + length=self.size, + dtype=self.codes.dtype, + ), + ) + new_codes = as_unsigned_codes(len(new_categories), new_codes) + out_col = type(self)( + data=self.data, # type: ignore[arg-type] + size=self.size, + dtype=CategoricalDtype( categories=new_categories, ordered=ordered ), + mask=self.base_mask, + offset=self.offset, + children=(new_codes,), ) elif ( not out_col._categories_equal(new_categories, ordered=True) @@ -1335,19 +1373,19 @@ def _set_categories( df.reset_index(drop=True, inplace=True) ordered = ordered if ordered is not None else self.ordered - new_codes = df._data["new_codes"] + new_codes = cast( + cudf.core.column.numerical.NumericalColumn, df._data["new_codes"] + ) # codes can't have masks, so take mask out before moving in - return cast( - Self, - column.build_categorical_column( - categories=new_cats, - codes=new_codes, - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, - ), + new_codes = as_unsigned_codes(len(new_cats), new_codes) + return type(self)( + data=self.data, # type: ignore[arg-type] + size=new_codes.size, + dtype=CategoricalDtype(categories=new_cats, ordered=ordered), + mask=new_codes.base_mask, + offset=new_codes.offset, + children=(new_codes,), ) def add_categories(self, new_categories: Any) -> Self: @@ -1425,56 +1463,16 @@ def remove_unused_categories(self) -> Self: "remove_unused_categories is currently not supported." ) - def as_ordered(self, ordered: bool): + def as_ordered(self, ordered: bool) -> Self: if self.dtype.ordered == ordered: return self - return column.build_categorical_column( - categories=self.categories, - codes=self.codes, - mask=self.base_mask, + return type(self)( + data=self.data, # type: ignore[arg-type] size=self.size, + dtype=CategoricalDtype( + categories=self.categories, ordered=ordered + ), + mask=self.base_mask, offset=self.offset, - ordered=ordered, + children=self.children, ) - - -def _create_empty_categorical_column( - categorical_column: CategoricalColumn, dtype: "CategoricalDtype" -) -> CategoricalColumn: - return column.build_categorical_column( - categories=column.as_column(dtype.categories), - codes=column.as_column( - _DEFAULT_CATEGORICAL_VALUE, - length=categorical_column.size, - dtype=categorical_column.codes.dtype, - ), - offset=categorical_column.offset, - size=categorical_column.size, - mask=categorical_column.base_mask, - ordered=dtype.ordered, - ) - - -def pandas_categorical_as_column( - categorical: ColumnLike, codes: ColumnLike | None = None -) -> CategoricalColumn: - """Creates a CategoricalColumn from a pandas.Categorical - - If ``codes`` is defined, use it instead of ``categorical.codes`` - """ - codes = categorical.codes if codes is None else codes - codes = column.as_column(codes) - - valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) - - mask = None - if not valid_codes.all(): - mask = bools_to_mask(valid_codes) - - return column.build_categorical_column( - categories=categorical.categories, - codes=codes, - size=codes.size, - mask=mask, - ordered=categorical.ordered, - ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 60b4126ddd4..885476a897c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -352,13 +352,17 @@ def from_arrow(cls, array: pa.Array) -> 
ColumnBase: codes = libcudf.interop.from_arrow(indices_table)[0] categories = libcudf.interop.from_arrow(dictionaries_table)[0] - - return build_categorical_column( - categories=categories, - codes=codes, - mask=codes.base_mask, + codes = cudf.core.column.categorical.as_unsigned_codes( + len(categories), codes + ) + return cudf.core.column.CategoricalColumn( + data=None, size=codes.size, - ordered=array.type.ordered, + dtype=CategoricalDtype( + categories=categories, ordered=array.type.ordered + ), + mask=codes.base_mask, + children=(codes,), ) result = libcudf.interop.from_arrow(data)[0] @@ -950,10 +954,10 @@ def is_monotonic_decreasing(self) -> bool: ) def sort_values( - self: ColumnBase, + self: Self, ascending: bool = True, na_position: str = "last", - ) -> ColumnBase: + ) -> Self: if (not ascending and self.is_monotonic_decreasing) or ( ascending and self.is_monotonic_increasing ): @@ -1041,12 +1045,16 @@ def as_categorical_column(self, dtype) -> ColumnBase: and dtype._categories is not None ): cat_col = dtype._categories - labels = self._label_encoding(cats=cat_col) - return build_categorical_column( - categories=cat_col, - codes=labels, + codes = self._label_encoding(cats=cat_col) + codes = cudf.core.column.categorical.as_unsigned_codes( + len(cat_col), codes + ) + return cudf.core.column.categorical.CategoricalColumn( + data=None, + size=None, + dtype=dtype, mask=self.mask, - ordered=dtype.ordered, + children=(codes,), ) # Categories must be unique and sorted in ascending order. @@ -1058,15 +1066,16 @@ def as_categorical_column(self, dtype) -> ColumnBase: # columns include null index in factorization; remove: if self.has_nulls(): cats = cats.dropna() - min_type = min_unsigned_type(len(cats), 8) - if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: - labels = labels.astype(min_type) - return build_categorical_column( - categories=cats, - codes=labels, + labels = cudf.core.column.categorical.as_unsigned_codes( + len(cats), labels + ) + return cudf.core.column.categorical.CategoricalColumn( + data=None, + size=None, + dtype=CategoricalDtype(categories=cats, ordered=ordered), mask=self.mask, - ordered=ordered, + children=(labels,), ) def as_numerical_column( @@ -1186,7 +1195,7 @@ def searchsorted( na_position=na_position, ) - def unique(self) -> ColumnBase: + def unique(self) -> Self: """ Get unique values in the data """ @@ -1695,51 +1704,6 @@ def build_column( raise TypeError(f"Unrecognized dtype: {dtype}") -def build_categorical_column( - categories: ColumnBase, - codes: ColumnBase, - mask: Buffer | None = None, - size: int | None = None, - offset: int = 0, - null_count: int | None = None, - ordered: bool = False, -) -> "cudf.core.column.CategoricalColumn": - """ - Build a CategoricalColumn - - Parameters - ---------- - categories : Column - Column of categories - codes : Column - Column of codes, the size of the resulting Column will be - the size of `codes` - mask : Buffer - Null mask - size : int, optional - offset : int, optional - ordered : bool, default False - Indicates whether the categories are ordered - """ - codes_dtype = min_unsigned_type(len(categories)) - codes = as_column(codes) - if codes.dtype != codes_dtype: - codes = codes.astype(codes_dtype) - - dtype = CategoricalDtype(categories=categories, ordered=ordered) - - result = build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=(codes,), - ) - return cast("cudf.core.column.CategoricalColumn", result) - - def check_invalid_array(shape: tuple, 
dtype): """Invalid ndarrays properties that are not supported""" if len(shape) > 1: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7f391c8a79c..78d2814ed26 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -651,22 +651,20 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: return False - def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: + def _with_type_metadata(self: Self, dtype: Dtype) -> ColumnBase: if isinstance(dtype, CategoricalDtype): - return column.build_categorical_column( - categories=dtype.categories._values, - codes=cudf.core.column.NumericalColumn( - self.base_data, # type: ignore[arg-type] - self.size, - dtype=self.dtype, - ), - mask=self.base_mask, - ordered=dtype.ordered, + codes = cudf.core.column.categorical.as_unsigned_codes( + len(dtype.categories), self + ) + return cudf.core.column.CategoricalColumn( + data=None, size=self.size, + dtype=dtype, + mask=self.base_mask, offset=self.offset, null_count=self.null_count, + children=(codes,), ) - return self def to_pandas( diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index a4ceea266b4..c9b1fa2669c 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -8,7 +8,8 @@ import cudf from cudf.api.types import is_list_like -from cudf.core.column import as_column, build_categorical_column +from cudf.core.column import as_column +from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.index import IntervalIndex, interval_range @@ -282,13 +283,17 @@ def cut( # should allow duplicate categories. return interval_labels[index_labels] - col = build_categorical_column( - categories=interval_labels, - codes=index_labels, + index_labels = as_unsigned_codes(len(interval_labels), index_labels) + + col = CategoricalColumn( + data=None, + size=index_labels.size, + dtype=cudf.CategoricalDtype( + categories=interval_labels, ordered=ordered + ), mask=index_labels.base_mask, offset=index_labels.offset, - size=index_labels.size, - ordered=ordered, + children=(index_labels,), ) # we return a categorical index, as we don't have a Categorical method diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6065e0e1eeb..0d632f4775f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -48,10 +48,10 @@ ColumnBase, StructColumn, as_column, - build_categorical_column, column_empty, concat_columns, ) +from cudf.core.column.categorical import as_unsigned_codes from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template @@ -3067,7 +3067,6 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, - _make_categorical_like, ) # First process the condition. 
@@ -3119,7 +3118,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): out = [] for (name, col), other_col in zip(self._data.items(), other_cols): - col, other_col = _check_and_cast_columns_with_other( + source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, inplace=inplace, @@ -3127,16 +3126,16 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): if cond_col := cond._data.get(name): result = cudf._lib.copying.copy_if_else( - col, other_col, cond_col + source_col, other_col, cond_col ) - out.append(_make_categorical_like(result, self._data[name])) + out.append(result._with_type_metadata(col.dtype)) else: out_mask = cudf._lib.null_mask.create_null_mask( - len(col), + len(source_col), state=cudf._lib.null_mask.MaskState.ALL_NULL, ) - out.append(col.set_mask(out_mask)) + out.append(source_col.set_mask(out_mask)) return self._mimic_inplace( self._from_data_like_self(self._data._from_columns_like_self(out)), @@ -3296,9 +3295,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): # least require a deprecation cycle because we currently support # inserting a pd.Categorical. if isinstance(value, pd.Categorical): - value = cudf.core.column.categorical.pandas_categorical_as_column( - value - ) + value = as_column(value) if _is_scalar_or_zero_d_array(value): dtype = None @@ -8510,12 +8507,16 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): def _reassign_categories(categories, cols, col_idxs): for name, idx in zip(cols, col_idxs): if idx in categories: - cols[name] = build_categorical_column( - categories=categories[idx], - codes=cols[name], - mask=cols[name].base_mask, - offset=cols[name].offset, - size=cols[name].size, + codes = as_unsigned_codes(len(categories[idx]), cols[name]) + cols[name] = CategoricalColumn( + data=None, + size=codes.size, + dtype=cudf.CategoricalDtype( + categories=categories[idx], ordered=False + ), + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index a70a42c04af..5250a741d3d 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -13,7 +13,12 @@ import cudf from cudf.core.buffer import Buffer, as_buffer -from cudf.core.column import as_column, build_categorical_column, build_column +from cudf.core.column import ( + CategoricalColumn, + NumericalColumn, + as_column, + build_column, +) # Implementation of interchange protocol classes # ---------------------------------------------- @@ -830,18 +835,19 @@ def _protocol_to_cudf_column_categorical( assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] codes_buffer = _ensure_gpu_buffer(codes_buffer, codes_dtype, allow_copy) - cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) - codes = build_column( - codes_buffer._buf, - cdtype, + cdtype = np.dtype(protocol_dtype_to_cupy_dtype(codes_dtype)) + codes = NumericalColumn( + data=codes_buffer._buf, + size=None, + dtype=cdtype, ) - - cudfcol = build_categorical_column( - categories=categories, - codes=codes, - mask=codes.base_mask, + cudfcol = CategoricalColumn( + data=None, size=codes.size, - ordered=ordered, + dtype=cudf.CategoricalDtype(categories=categories, ordered=ordered), + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) return _set_missing_values(col, cudfcol, allow_copy), buffers diff --git 
a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cbe1e97d834..7b2bc85b13b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -24,10 +24,10 @@ from cudf.core.column import ( ColumnBase, as_column, - build_categorical_column, deserialize_columns, serialize_columns, ) +from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import BinaryOperand, Scannable from cudf.utils import ioutils @@ -889,18 +889,21 @@ def from_arrow(cls, data: pa.Table) -> Self: for name in dict_dictionaries.keys() } - cudf_category_frame = { - name: build_categorical_column( - cudf_dictionaries_columns[name], - codes, - mask=codes.base_mask, + for name, codes in zip( + dict_indices_table.column_names, indices_columns + ): + categories = cudf_dictionaries_columns[name] + codes = as_unsigned_codes(len(categories), codes) + cudf_category_frame[name] = CategoricalColumn( + data=None, size=codes.size, - ordered=dict_ordered[name], - ) - for name, codes in zip( - dict_indices_table.column_names, indices_columns + dtype=cudf.CategoricalDtype( + categories=categories, + ordered=dict_ordered[name], + ), + mask=codes.base_mask, + children=(codes,), ) - } # Handle non-dict arrays cudf_non_category_frame = { diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 27c6556f976..500fc580097 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3079,22 +3079,8 @@ def __init__( name = _getdefault_name(data, name=name) if isinstance(data, CategoricalColumn): data = data - elif isinstance(data, pd.Series) and ( - isinstance(data.dtype, pd.CategoricalDtype) - ): - codes_data = column.as_column(data.cat.codes.values) - data = column.build_categorical_column( - categories=data.cat.categories, - codes=codes_data, - ordered=data.cat.ordered, - ) - elif isinstance(data, (pd.Categorical, pd.CategoricalIndex)): - codes_data = column.as_column(data.codes) - data = column.build_categorical_column( - categories=data.categories, - codes=codes_data, - ordered=data.ordered, - ) + elif isinstance(getattr(data, "dtype", None), pd.CategoricalDtype): + data = column.as_column(data) else: data = column.as_column( data, dtype="category" if dtype is None else dtype diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ad6aa56d472..fd6bf37f0e6 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -173,17 +173,7 @@ def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): def _indices_from_labels(obj, labels): if not isinstance(labels, cudf.MultiIndex): labels = cudf.core.column.as_column(labels) - - if isinstance(obj.index.dtype, cudf.CategoricalDtype): - labels = labels.astype("category") - codes = labels.codes.astype(obj.index.codes.dtype) - labels = cudf.core.column.build_categorical_column( - categories=labels.dtype.categories, - codes=codes, - ordered=labels.dtype.ordered, - ) - else: - labels = labels.astype(obj.index.dtype) + labels = labels.astype(obj.index.dtype) idx_labels = cudf.Index._from_column(labels) else: idx_labels = labels diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4be10752651..a831a798772 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -38,7 +38,9 @@ as_column, ) from cudf.core.column.categorical import ( + _DEFAULT_CATEGORICAL_VALUE, CategoricalAccessor as 
CategoricalAccessor, + CategoricalColumn, ) from cudf.core.column.column import concat_columns from cudf.core.column.lists import ListMethods @@ -511,9 +513,22 @@ def from_categorical(cls, categorical, codes=None): dtype: category Categories (3, object): ['a', 'b', 'c'] """ # noqa: E501 - col = cudf.core.column.categorical.pandas_categorical_as_column( - categorical, codes=codes - ) + col = as_column(categorical) + if codes is not None: + codes = as_column(codes) + + valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) + + mask = None + if not valid_codes.all(): + mask = libcudf.transform.bools_to_mask(valid_codes) + col = CategoricalColumn( + data=col.data, + size=codes.size, + dtype=col.dtype, + mask=mask, + children=(codes,), + ) return Series._from_column(col) @classmethod diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index eb6714029cf..55dda34a576 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -350,7 +350,6 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: def where(self, cond, other=None, inplace=False): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, - _make_categorical_like, ) if isinstance(other, cudf.DataFrame): @@ -366,14 +365,12 @@ def where(self, cond, other=None, inplace=False): if not cudf.api.types.is_scalar(other): other = cudf.core.column.as_column(other) - self_column = self._column input_col, other = _check_and_cast_columns_with_other( - source_col=self_column, other=other, inplace=inplace + source_col=self._column, other=other, inplace=inplace ) result = cudf._lib.copying.copy_if_else(input_col, other, cond) - - return _make_categorical_like(result, self_column) + return result._with_type_metadata(self.dtype) @_performance_tracking def transpose(self): diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index d6b2ae2f31c..984115dcbbe 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -20,7 +20,8 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like -from cudf.core.column import as_column, build_categorical_column, column_empty +from cudf.core.column import as_column, column_empty +from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking @@ -811,12 +812,17 @@ def _parquet_to_frame( partition_categories[name].index(value), length=_len, ) - dfs[-1][name] = build_categorical_column( - categories=partition_categories[name], - codes=codes, + codes = as_unsigned_codes( + len(partition_categories[name]), codes + ) + dfs[-1][name] = CategoricalColumn( + data=None, size=codes.size, + dtype=cudf.CategoricalDtype( + categories=partition_categories[name], ordered=False + ), offset=codes.offset, - ordered=False, + children=(codes,), ) else: # Not building categorical columns, so diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 5bd3eb5fa7f..9347ebba5de 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -64,8 +64,11 @@ def _nonempty_index(idx): values = cudf.core.column.as_column(data) return cudf.DatetimeIndex(values, name=idx.name) elif isinstance(idx, cudf.CategoricalIndex): - values = cudf.core.column.build_categorical_column( - categories=idx.categories, codes=[0, 0], 
ordered=idx.ordered + values = cudf.core.column.CategoricalColumn( + data=None, + size=None, + dtype=idx.dtype, + children=(cudf.core.column.as_column([0, 0], dtype=np.uint8),), ) return cudf.CategoricalIndex(values, name=idx.name) elif isinstance(idx, cudf.MultiIndex): @@ -105,12 +108,16 @@ def _get_non_empty_data( ) codes = cudf.core.column.as_column( 0, - dtype=cudf._lib.types.size_type_dtype, + dtype=np.uint8, length=2, ) - ordered = s.ordered # type: ignore[attr-defined] - return cudf.core.column.build_categorical_column( - categories=categories, codes=codes, ordered=ordered + return cudf.core.column.CategoricalColumn( + data=None, + size=codes.size, + dtype=cudf.CategoricalDtype( + categories=categories, ordered=s.dtype.ordered + ), + children=(codes,), # type: ignore[arg-type] ) elif isinstance(s.dtype, cudf.ListDtype): leaf_type = s.dtype.leaf_type diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index c025280c240..e793d4381d1 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -19,7 +19,7 @@ create_metadata_file_dd = None import cudf -from cudf.core.column import as_column, build_categorical_column +from cudf.core.column import CategoricalColumn, as_column from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type @@ -163,12 +163,14 @@ def _read_paths( partitions[i].keys.get_loc(index2), length=len(df), ) - df[name] = build_categorical_column( - categories=partitions[i].keys, - codes=codes, + df[name] = CategoricalColumn( + data=None, size=codes.size, + dtype=cudf.CategoricalDtype( + categories=partitions[i].keys, ordered=False + ), offset=codes.offset, - ordered=False, + children=(codes,), ) elif name not in df.columns: # Add non-categorical partition column From c600a65e4fd82a4a6eb00feaee032b62872de761 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Aug 2024 13:22:18 -0700 Subject: [PATCH 141/270] Relax Arrow pin (#16681) With this change, cudf users can install any version of pyarrow from 14.0.0 up to (but not including) 18.0. Version 14 is the minimum version supporting the C Data Interface, which is a requirement for us (it may be possible to relax this in principle, but doing so would require changes to the cudf/pylibcudf code). A few tests are skipped due to bugs or missing features in older versions of pyarrow. 
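For reference, a sketch of the version-gating pattern the new test skips use (the marker name is illustrative; the condition and reason string mirror the diff below):

```python
# Reusable pytest skip marker, assuming pyarrow and packaging are installed.
import pyarrow as pa
import pytest
from packaging import version

requires_pyarrow_15 = pytest.mark.skipif(
    version.parse(pa.__version__) < version.parse("15.0.0"),
    reason="https://github.com/apache/arrow/pull/37792",
)
```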
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16681 --- .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-125_arch-x86_64.yaml | 1 + conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/pylibcudf/meta.yaml | 2 +- dependencies.yaml | 12 ++-------- python/cudf/cudf/tests/test_parquet.py | 24 +++++++++++++++---- python/cudf/pyproject.toml | 2 +- python/libcudf/pyproject.toml | 3 --- python/pylibcudf/pylibcudf/interop.pyx | 1 - python/pylibcudf/pyproject.toml | 2 +- 10 files changed, 28 insertions(+), 22 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c4c32da8af2..7f6967d7287 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -67,6 +67,7 @@ dependencies: - pandoc - pre-commit - ptxcompiler +- pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 7439c9543a5..c1315e73f16 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,6 +64,7 @@ dependencies: - pandas>=2.0,<2.2.3dev0 - pandoc - pre-commit +- pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 - pynvjitlink>=0.0.0a0 - pytest-benchmark diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 53f52a35651..e22b4a4eddc 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -82,7 +82,7 @@ requirements: - cupy >=12.0.0 - numba >=0.57 - numpy >=1.23,<3.0a0 - - pyarrow ==16.1.0.* + - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} - pylibcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 67b9b76bb8c..7c1efa0176c 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -79,7 +79,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.3dev0 - numpy >=1.23,<3.0a0 - - pyarrow ==16.1.0.* + - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} diff --git a/dependencies.yaml b/dependencies.yaml index 5be291b3671..c6851d9cb90 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -19,6 +19,7 @@ files: - docs - notebooks - py_version + - pyarrow_run - rapids_build_skbuild - rapids_build_setuptools - run_common @@ -46,7 +47,6 @@ files: includes: - cuda_version - py_version - - pyarrow_run - test_python_common - test_python_cudf - test_python_dask_cudf @@ -136,13 +136,6 @@ files: - build_base - build_cpp - depends_on_librmm - py_run_libcudf: - output: pyproject - pyproject_dir: python/libcudf - extras: - table: project - includes: - - pyarrow_run py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -390,8 +383,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - # Allow runtime version to float up to patch version - - pyarrow>=16.1.0,<16.2.0a0 + - pyarrow>=14.0.0,<18.0.0a0 cuda_version: specific: - output_types: conda diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index db4f1c9c8bd..879b2bd3d74 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ 
b/python/cudf/cudf/tests/test_parquet.py @@ -515,10 +515,6 @@ def test_parquet_read_filtered_multiple_files(tmpdir): ) -@pytest.mark.skipif( - version.parse(pa.__version__) < version.parse("1.0.1"), - reason="pyarrow 1.0.0 needed for various operators and operand types", -) @pytest.mark.parametrize( "predicate,expected_len", [ @@ -2393,6 +2389,10 @@ def test_parquet_writer_list_large_mixed(tmpdir): @pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_writer_list_chunked(tmpdir, store_schema): + if store_schema and version.parse(pa.__version__) < version.parse( + "15.0.0" + ): + pytest.skip("https://github.com/apache/arrow/pull/37792") table1 = cudf.DataFrame( { "a": list_gen(string_gen, 128, 80, 50), @@ -2578,6 +2578,10 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("add_nulls", [True, False]) @pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema): + if store_schema and version.parse(pa.__version__) < version.parse( + "15.0.0" + ): + pytest.skip("https://github.com/apache/arrow/pull/37792") file_path = tmpdir.join("cudf.parquet") if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category", "col_bool"]) @@ -2957,6 +2961,10 @@ def test_per_column_options_string_col(tmpdir, encoding): assert encoding in fmd.row_group(0).column(0).encodings +@pytest.mark.skipif( + version.parse(pa.__version__) < version.parse("16.0.0"), + reason="https://github.com/apache/arrow/pull/39748", +) @pytest.mark.parametrize( "num_rows", [200, 10000], @@ -3557,6 +3565,10 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): @pytest.mark.parametrize("index", [None, True, False]) +@pytest.mark.skipif( + version.parse(pa.__version__) < version.parse("15.0.0"), + reason="https://github.com/apache/arrow/pull/37792", +) def test_parquet_writer_roundtrip_with_arrow_schema(index): # Ensure that the concrete and nested types are faithfully being roundtripped # across Parquet with arrow schema @@ -3707,6 +3719,10 @@ def test_parquet_writer_int96_timestamps_and_arrow_schema(): ], ) @pytest.mark.parametrize("index", [None, True, False]) +@pytest.mark.skipif( + version.parse(pa.__version__) < version.parse("15.0.0"), + reason="https://github.com/apache/arrow/pull/37792", +) def test_parquet_writer_roundtrip_structs_with_arrow_schema( tmpdir, data, index ): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 0c1d5015078..17d1292980b 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.3dev0", "ptxcompiler", - "pyarrow>=16.1.0,<16.2.0a0", + "pyarrow>=14.0.0,<18.0.0a0", "pylibcudf==24.10.*,>=0.0.0a0", "rich", "rmm==24.10.*,>=0.0.0a0", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 43878d0aec2..5f4b9957fd0 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -37,9 +37,6 @@ classifiers = [ "Programming Language :: C++", "Environment :: GPU :: NVIDIA CUDA", ] -dependencies = [ - "pyarrow>=16.1.0,<16.2.0a0", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] Homepage = "https://github.com/rapidsai/cudf" diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index d54e5b7ba1f..1a03fa5b45b 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -152,7 +152,6 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): @from_arrow.register(pa.Array) -@from_arrow.register(pa.ChunkedArray) def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index e4c6edc6141..bfade41353c 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "libcudf==24.10.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=16.1.0,<16.2.0a0", + "pyarrow>=14.0.0,<18.0.0a0", "rmm==24.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From fbd61142a47bd9ef6f739f97c81c88c1ca9430d4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:06:10 -0700 Subject: [PATCH 142/270] Support reading matching projected and filter cols from Parquet files with otherwise mismatched schemas (#16394) Closes #16269. This PR adds support for reading (matching) projected/selected and filter columns from Parquet files with otherwise mismatched schemas. ### Solution Description We create a `std::vector<std::unordered_map<int, int>>`, one map per file except the 0th file. We then co-walk the schema trees and populate each map with the corresponding (one-to-one mapped) `schema_idx` of every valid selected (projection and filter) column between the 0th file and the rest of the files. The same `unordered_map` is used to get the `schema_idx` of the same columns across files when creating `ColumnChunkDesc` and copying column chunk metadata into the page decoder. ### Known Limitation - [x] Nullability across files: Each selected column must still be consistently nullable or non-nullable across all files.
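A minimal usage sketch of the new option (illustrative only, not taken from the patch: the file names are hypothetical, and both files are assumed to contain the selected columns with identical types and consistent nullability, per the limitation above):

```python
import cudf

# part1.parquet and part2.parquet have different overall schemas, but both
# contain "id" and "value" columns with matching types and nullability.
df = cudf.read_parquet(
    ["part1.parquet", "part2.parquet"],
    columns=["id", "value"],
    allow_mismatched_pq_schemas=True,
)
```

With the option left at its default (`False`), the reader keeps the previous behavior of requiring matching schemas across all sources.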
See #12702 also described in [#dask/9935](https://github.com/dask/dask/pull/9935) CC @wence- Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16394 --- cpp/include/cudf/io/parquet.hpp | 37 +++ cpp/src/io/parquet/reader_impl.cpp | 13 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 174 ++++++++++-- cpp/src/io/parquet/reader_impl_helpers.hpp | 53 +++- python/cudf/cudf/_lib/parquet.pyx | 10 +- python/cudf/cudf/io/parquet.py | 5 + python/cudf/cudf/tests/test_parquet.py | 248 ++++++++++++++++++ python/cudf/cudf/utils/ioutils.py | 3 + python/pylibcudf/pylibcudf/io/parquet.pxd | 1 + python/pylibcudf/pylibcudf/io/parquet.pyx | 14 +- .../pylibcudf/libcudf/io/parquet.pxd | 6 +- 11 files changed, 534 insertions(+), 30 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 12897ac77ef..64c37f9a9df 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -76,6 +76,8 @@ class parquet_reader_options { bool _use_pandas_metadata = true; // Whether to read and use ARROW schema bool _use_arrow_schema = true; + // Whether to allow reading matching select columns from mismatched Parquet files. + bool _allow_mismatched_pq_schemas = false; // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; @@ -138,6 +140,18 @@ class parquet_reader_options { */ [[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; } + /** + * @brief Returns true/false depending on whether to read matching projected and filter columns + * from mismatched Parquet sources. + * + * @return `true` if mismatched projected and filter columns will be read from mismatched Parquet + * sources. + */ + [[nodiscard]] bool is_enabled_allow_mismatched_pq_schemas() const + { + return _allow_mismatched_pq_schemas; + } + /** * @brief Returns optional tree of metadata. * @@ -258,6 +272,15 @@ class parquet_reader_options { */ void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; } + /** + * @brief Sets to enable/disable reading of matching projected and filter columns from mismatched + * Parquet sources. + * + * @param val Boolean value whether to read matching projected and filter columns from mismatched + * Parquet sources. + */ + void enable_allow_mismatched_pq_schemas(bool val) { _allow_mismatched_pq_schemas = val; } + /** * @brief Sets reader column schema. * @@ -382,6 +405,20 @@ class parquet_reader_options_builder { return *this; } + /** + * @brief Sets to enable/disable reading of matching projected and filter columns from mismatched + * Parquet sources. + * + * @param val Boolean value whether to read matching projected and filter columns from mismatched + * Parquet sources. + * @return this for chaining. + */ + parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) + { + options._allow_mismatched_pq_schemas = val; + return *this; + } + /** * @brief Sets reader metadata. 
* diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 2648a1f41ab..9950e2f7d7d 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -470,8 +470,10 @@ reader::impl::impl(std::size_t chunk_read_limit, _input_pass_read_limit{pass_read_limit} { // Open and parse the source dataset metadata - _metadata = - std::make_unique(_sources, options.is_enabled_use_arrow_schema()); + _metadata = std::make_unique( + _sources, + options.is_enabled_use_arrow_schema(), + options.get_columns().has_value() and options.is_enabled_allow_mismatched_pq_schemas()); // Strings may be returned as either string or categorical columns _strings_to_categorical = options.is_enabled_convert_strings_to_categories(); @@ -769,11 +771,14 @@ parquet_column_schema walk_schema(aggregate_reader_metadata const* mt, int idx) parquet_metadata read_parquet_metadata(host_span const> sources) { - // do not use arrow schema when reading information from parquet metadata. + // Do not use arrow schema when reading information from parquet metadata. static constexpr auto use_arrow_schema = false; + // Do not select any columns when only reading the parquet metadata. + static constexpr auto has_column_projection = false; + // Open and parse the source dataset metadata - auto metadata = aggregate_reader_metadata(sources, use_arrow_schema); + auto metadata = aggregate_reader_metadata(sources, use_arrow_schema, has_column_projection); return parquet_metadata{parquet_schema{walk_schema(&metadata, 0)}, metadata.get_num_rows(), diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 00f75e4e828..8b5678f202b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -380,6 +380,17 @@ aggregate_reader_metadata::collect_keyval_metadata() const return kv_maps; } +std::vector> aggregate_reader_metadata::init_schema_idx_maps( + bool const has_cols_from_mismatched_srcs) const +{ + // Only initialize if more than 1 data sources and has select columns from mismatched data sources + if (has_cols_from_mismatched_srcs and per_file_metadata.size() > 1) { + return std::vector>{per_file_metadata.size() - 1}; + } + + return {}; +} + int64_t aggregate_reader_metadata::calc_num_rows() const { return std::accumulate( @@ -539,13 +550,18 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf } aggregate_reader_metadata::aggregate_reader_metadata( - host_span const> sources, bool use_arrow_schema) + host_span const> sources, + bool use_arrow_schema, + bool has_cols_from_mismatched_srcs) : per_file_metadata(metadatas_from_sources(sources)), keyval_maps(collect_keyval_metadata()), + schema_idx_maps(init_schema_idx_maps(has_cols_from_mismatched_srcs)), num_rows(calc_num_rows()), num_row_groups(calc_num_row_groups()) { - if (per_file_metadata.size() > 0) { + // Validate that all sources have the same schema unless we are reading select columns + // from mismatched sources, in which case, we will only check the projected columns later. + if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) { auto const& first_meta = per_file_metadata.front(); auto const num_cols = first_meta.row_groups.size() > 0 ? 
first_meta.row_groups.front().columns.size() : 0; @@ -632,7 +648,7 @@ arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const if (field->type_type() == flatbuf::Type::Type_Duration) { auto type_data = field->type_as_Duration(); if (type_data != nullptr) { - auto name = (field->name()) ? field->name()->str() : ""; + auto name = field->name() ? field->name()->str() : ""; // set the schema_elem type to duration type schema_elem.type = duration_from_flatbuffer(type_data); arrow_type_col_seen |= (schema_elem.type.id() != type_id::EMPTY); @@ -868,12 +884,23 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t size_type src_idx, int schema_idx) const { + // schema_idx_maps will only have > 0 size when we are reading matching column projection from + // mismatched Parquet sources. + if (src_idx and not schema_idx_maps.empty()) { + auto const& schema_idx_map = schema_idx_maps[src_idx - 1]; + CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), + "Unmapped schema index encountered in the specified source tree", + std::range_error); + schema_idx = schema_idx_map.at(schema_idx); + } + auto col = std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; }); CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), - "Found no metadata for schema index"); + "Found no metadata for schema index", + std::range_error); return col->meta_data; } @@ -1041,18 +1068,19 @@ aggregate_reader_metadata::select_columns( std::optional> const& filter_columns_names, bool include_index, bool strings_to_categorical, - type_id timestamp_type_id) const + type_id timestamp_type_id) { - auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { - auto const& col_schema_idx = - std::find_if(schema_elem.children_idx.cbegin(), - schema_elem.children_idx.cend(), - [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); - - return (col_schema_idx != schema_elem.children_idx.end()) - ? static_cast(*col_schema_idx) - : -1; - }; + auto const find_schema_child = + [&](SchemaElement const& schema_elem, std::string const& name, int const pfm_idx = 0) { + auto const& col_schema_idx = std::find_if( + schema_elem.children_idx.cbegin(), + schema_elem.children_idx.cend(), + [&](size_t col_schema_idx) { return get_schema(col_schema_idx, pfm_idx).name == name; }); + + return (col_schema_idx != schema_elem.children_idx.end()) + ? static_cast(*col_schema_idx) + : -1; + }; std::vector output_columns; std::vector input_columns; @@ -1074,7 +1102,7 @@ aggregate_reader_metadata::select_columns( if (schema_elem.is_stub()) { // is this legit? CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); - auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr; + auto const child_col_name_info = col_name_info ? 
&col_name_info->children[0] : nullptr; return build_column( child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); } @@ -1154,6 +1182,97 @@ aggregate_reader_metadata::select_columns( return path_is_valid; }; + // Compares two schema elements to be equal except their number of children + auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) { + return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and + lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and + lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and + lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id; + }; + + // Maps a projected column's schema_idx in the zeroth per_file_metadata (source) to the + // corresponding schema_idx in pfm_idx'th per_file_metadata (destination). The projected + // column's path must match across sources, else an appropriate exception is thrown. + std::function map_column = + [&](column_name_info const* col_name_info, + int const src_schema_idx, + int const dst_schema_idx, + int const pfm_idx) { + auto const& src_schema_elem = get_schema(src_schema_idx); + auto const& dst_schema_elem = get_schema(dst_schema_idx, pfm_idx); + + // Check the schema elements to be equal except their number of children as we only care about + // the specific column paths in the schema trees. Raise an invalid_argument error if the + // schema elements don't match. + CUDF_EXPECTS(equal_to_except_num_children(src_schema_elem, dst_schema_elem), + "Encountered mismatching SchemaElement properties for a column in " + "the selected path", + std::invalid_argument); + + // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer + // hierarchy. So continue on with mapping. + if (src_schema_elem.is_stub()) { + // Check if dst_schema_elem is also a stub i.e. has num_children == 1 that we didn't + // previously check. Raise an invalid_argument error if dst_schema_elem is not a stub. + CUDF_EXPECTS(dst_schema_elem.is_stub(), + "Encountered mismatching schemas for stub.", + std::invalid_argument); + auto const child_col_name_info = col_name_info ? &col_name_info->children[0] : nullptr; + return map_column(child_col_name_info, + src_schema_elem.children_idx[0], + dst_schema_elem.children_idx[0], + pfm_idx); + } + + // The path ends here. If this is a list/struct col (has children), then map all its children + // which must be identical. + if (col_name_info == nullptr or col_name_info->children.empty()) { + // Check the number of children to be equal to be mapped. An out_of_range error if the + // number of children isn't equal. + CUDF_EXPECTS(src_schema_elem.num_children == dst_schema_elem.num_children, + "Encountered mismatching number of children for a " + "column in the selected path", + std::out_of_range); + + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(src_schema_elem.num_children), + [&](auto const child_idx) { + map_column(nullptr, + src_schema_elem.children_idx[child_idx], + dst_schema_elem.children_idx[child_idx], + pfm_idx); + }); + } + // The path goes further down to specific child(ren) of this column so map only those + // children. + else { + std::for_each( + col_name_info->children.cbegin(), + col_name_info->children.cend(), + [&](auto const& child_col_name_info) { + // Ensure that each named child column exists in the destination schema tree for the + // paths to align up. 
An out_of_range error otherwise. + CUDF_EXPECTS( + find_schema_child(dst_schema_elem, child_col_name_info.name, pfm_idx) != -1, + "Encountered mismatching schema tree depths across data sources", + std::out_of_range); + map_column(&child_col_name_info, + find_schema_child(src_schema_elem, child_col_name_info.name), + find_schema_child(dst_schema_elem, child_col_name_info.name, pfm_idx), + pfm_idx); + }); + } + + // We're at a leaf and this is an input column (one with actual data stored) so map it. + if (src_schema_elem.num_children == 0) { + // Get the schema_idx_map for this data source (pfm) + auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + + // Map the schema index from 0th tree (src) to the one in the current (dst) tree. + schema_idx_map[src_schema_idx] = dst_schema_idx; + } + }; + std::vector output_column_schemas; // @@ -1287,7 +1406,28 @@ aggregate_reader_metadata::select_columns( for (auto& col : selected_columns) { auto const& top_level_col_schema_idx = find_schema_child(root, col.name); bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); - if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); + if (valid_column) { + output_column_schemas.push_back(top_level_col_schema_idx); + + // Map the column's schema_idx across the rest of the data sources if required. + if (per_file_metadata.size() > 1 and not schema_idx_maps.empty()) { + std::for_each(thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(per_file_metadata.size()), + [&](auto const pfm_idx) { + auto const& dst_root = get_schema(0, pfm_idx); + // Ensure that each top level column exists in the destination schema + // tree. An out_of_range error is thrown otherwise. + CUDF_EXPECTS( + find_schema_child(dst_root, col.name, pfm_idx) != -1, + "Encountered mismatching schema tree depths across data sources", + std::out_of_range); + map_column(&col, + top_level_col_schema_idx, + find_schema_child(dst_root, col.name, pfm_idx), + pfm_idx); + }); + } + } } } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 309132a5347..6f2863136b2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -128,6 +128,7 @@ struct arrow_schema_data_types { class aggregate_reader_metadata { std::vector per_file_metadata; std::vector> keyval_maps; + std::vector> schema_idx_maps; int64_t num_rows; size_type num_row_groups; @@ -144,6 +145,19 @@ class aggregate_reader_metadata { [[nodiscard]] std::vector> collect_keyval_metadata() const; + /** + * @brief Initialize the vector of schema_idx maps. + * + * Initializes a vector of hash maps that will store the one-to-one mappings between the + * schema_idx'es of the selected columns in the zeroth per_file_metadata (source) and each + * kth per_file_metadata (destination) for k in range: [1, per_file_metadata.size()-1]. + * + * @param has_cols_from_mismatched_srcs True if we are reading select cols from mismatched + * parquet schemas. 
+ */ + [[nodiscard]] std::vector> init_schema_idx_maps( + bool has_cols_from_mismatched_srcs) const; + /** * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message * in key value metadata section of Parquet file footer @@ -183,10 +197,28 @@ class aggregate_reader_metadata { public: aggregate_reader_metadata(host_span const> sources, - bool use_arrow_schema); + bool use_arrow_schema, + bool has_cols_from_mismatched_srcs); [[nodiscard]] RowGroup const& get_row_group(size_type row_group_index, size_type src_idx) const; + /** + * @brief Extracts the schema_idx'th column chunk metadata from row_group_index'th row group of + * the src_idx'th file. + * + * Extracts the schema_idx'th column chunk metadata from the specified row group index of the + * src_idx'th file. Note that the schema_idx is actually the index in the zeroth file which may + * not be the same in all files, in which case, the schema_idx is mapped to the corresponding + * index in the src_idx'th file and returned. A range_error error is thrown if schema_idx + * doesn't exist or isn't mapped to the src_idx file. + * + * @param row_group_index The row group index in the file to extract column chunk metadata from. + * @param src_idx The per_file_metadata index to extract extract column chunk metadata from. + * @param schema_idx The schema_idx of the column chunk to be extracted + * + * @return The requested column chunk metadata or a range_error error if the schema index isn't + * valid. + */ [[nodiscard]] ColumnChunkMetaData const& get_column_metadata(size_type row_group_index, size_type src_idx, int schema_idx) const; @@ -202,9 +234,22 @@ class aggregate_reader_metadata { [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } - [[nodiscard]] auto const& get_schema(int schema_idx) const + /** + * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file + * + * @param schema_idx The index of the SchemaElement to be extracted. + * @param pfm_idx The index of the per_file_metadata to extract SchemaElement from, default = 0 if + * not specified. + * + * @return The requested SchemaElement or an error if invalid schema_idx or pfm_idx. 
+ */ + [[nodiscard]] auto const& get_schema(int schema_idx, int pfm_idx = 0) const { - return per_file_metadata[0].schema[schema_idx]; + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::invalid_argument); + return per_file_metadata[pfm_idx].schema[schema_idx]; } [[nodiscard]] auto const& get_key_value_metadata() const& { return keyval_maps; } @@ -314,7 +359,7 @@ class aggregate_reader_metadata { std::optional> const& filter_columns_names, bool include_index, bool strings_to_categorical, - type_id timestamp_type_id) const; + type_id timestamp_type_id); }; /** diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c874a51e220..a0155671a26 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -266,7 +266,8 @@ def read_parquet_chunked( size_t chunk_read_limit=0, size_t pass_read_limit=1024000000, size_type nrows=-1, - int64_t skip_rows=0 + int64_t skip_rows=0, + allow_mismatched_pq_schemas=False ): # Note: If this function ever takes accepts filters # allow_range_index needs to be False when a filter is passed @@ -277,11 +278,12 @@ def read_parquet_chunked( plc.io.SourceInfo(filepaths_or_buffers), columns, row_groups, - use_pandas_metadata, + use_pandas_metadata=use_pandas_metadata, chunk_read_limit=chunk_read_limit, pass_read_limit=pass_read_limit, skip_rows=skip_rows, nrows=nrows, + allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) tbl_w_meta = reader.read_chunk() @@ -323,7 +325,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, use_pandas_metadata=True, Expression filters=None, size_type nrows=-1, - int64_t skip_rows=0): + int64_t skip_rows=0, + allow_mismatched_pq_schemas=False): """ Cython function to call into libcudf API, see `read_parquet`. 
@@ -351,6 +354,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, use_pandas_metadata = use_pandas_metadata, skip_rows = skip_rows, nrows = nrows, + allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) df = cudf.DataFrame._from_data( diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 984115dcbbe..526f12aa94e 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -514,6 +514,7 @@ def read_parquet( dataset_kwargs=None, nrows=None, skip_rows=None, + allow_mismatched_pq_schemas=False, *args, **kwargs, ): @@ -622,6 +623,7 @@ def read_parquet( dataset_kwargs=dataset_kwargs, nrows=nrows, skip_rows=skip_rows, + allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, **kwargs, ) # Apply filters row-wise (if any are defined), and return @@ -865,6 +867,7 @@ def _read_parquet( use_pandas_metadata=None, nrows=None, skip_rows=None, + allow_mismatched_pq_schemas=False, *args, **kwargs, ): @@ -889,6 +892,7 @@ def _read_parquet( use_pandas_metadata=use_pandas_metadata, nrows=nrows if nrows is not None else -1, skip_rows=skip_rows if skip_rows is not None else 0, + allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) else: if nrows is None: @@ -902,6 +906,7 @@ def _read_parquet( use_pandas_metadata=use_pandas_metadata, nrows=nrows, skip_rows=skip_rows, + allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) else: if ( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 879b2bd3d74..6623c537ddf 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3809,3 +3809,251 @@ def test_parquet_reader_pandas_compatibility(): with cudf.option_context("io.parquet.low_memory", True): expected = cudf.read_parquet(buffer) assert_eq(expected, df) + + +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_reader_with_mismatched_tables(store_schema): + # cuDF tables with mixed types + df1 = cudf.DataFrame( + { + "i32": cudf.Series([None, None, None], dtype="int32"), + "i64": cudf.Series([1234, None, 123], dtype="int64"), + "list": list([[1, 2], [None, 4], [5, 6]]), + "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "str": ["vfd", None, "ghu"], + "d_list": list( + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [None, pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), None], + ] + ), + } + ) + + df2 = cudf.DataFrame( + { + "str": ["abc", "def", None], + "i64": cudf.Series([None, 65, 98], dtype="int64"), + "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), + "list": list([[7, 8], [9, 10], [None, 12]]), + "d_list": list( + [ + [pd.Timedelta(minutes=4), None], + [None, None], + [pd.Timedelta(minutes=6), None], + ] + ), + } + ) + + # IO buffers + buf1 = BytesIO() + buf2 = BytesIO() + + # Write Parquet with and without arrow schema + df1.to_parquet(buf1, store_schema=store_schema) + df2.to_parquet(buf2, store_schema=store_schema) + + # Read mismatched Parquet files + got = cudf.read_parquet( + [buf1, buf2], + columns=["list", "d_list", "str"], + filters=[("i64", ">", 20)], + allow_mismatched_pq_schemas=True, + ) + + # Construct the expected table + expected = cudf.concat( + [ + df1[df1["i64"] > 20][["list", "d_list", "str"]], + df2[df2["i64"] > 20][["list", "d_list", "str"]], + ] + ).reset_index(drop=True) + + # Read with chunked reader (filter columns not supported) + got_chunked = read_parquet_chunked( + [buf1, buf2], + columns=["list", "d_list", "str"], + 
chunk_read_limit=240, + pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) + + # Construct the expected table without filter columns + expected_chunked = cudf.concat( + [df1[["list", "d_list", "str"]], df2[["list", "d_list", "str"]]] + ).reset_index(drop=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected_chunked, got_chunked) + + +def test_parquet_reader_with_mismatched_structs(): + data1 = [ + { + "a": 1, + "b": { + "inner_a": 10, + "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2}, + }, + "c": 2, + }, + { + "a": 3, + "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}}, + "c": 4, + }, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + { + "a": None, + "b": { + "inner_a": None, + "inner_b": {"inner_inner_b": None, "inner_inner_a": 10}, + }, + "c": 10, + }, + ] + + data2 = [ + {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}}, + {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}}, + {"a": 5, "b": {"inner_b": None}}, + {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}}, + {"a": None, "b": {"inner_b": None}}, + None, + {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}}, + ] + + # cuDF tables from struct data + df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) + df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) + + # Buffers + buf1 = BytesIO() + buf2 = BytesIO() + + # Write to parquet + df1.to_parquet(buf1) + df2.to_parquet(buf2) + + # Read the struct.b.inner_b.inner_inner_a column from parquet + got = cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.inner_b.inner_inner_a"], + allow_mismatched_pq_schemas=True, + ) + got = ( + cudf.Series(got["struct"]) + .struct.field("b") + .struct.field("inner_b") + .struct.field("inner_inner_a") + ) + + # Read with chunked reader + got_chunked = read_parquet_chunked( + [buf1, buf2], + columns=["struct.b.inner_b.inner_inner_a"], + chunk_read_limit=240, + pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) + got_chunked = ( + cudf.Series(got_chunked["struct"]) + .struct.field("b") + .struct.field("inner_b") + .struct.field("inner_inner_a") + ) + + # Construct the expected series + expected = cudf.concat( + [ + cudf.Series(df1["struct"]) + .struct.field("b") + .struct.field("inner_b") + .struct.field("inner_inner_a"), + cudf.Series(df2["struct"]) + .struct.field("b") + .struct.field("inner_b") + .struct.field("inner_inner_a"), + ] + ).reset_index(drop=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got_chunked) + + +def test_parquet_reader_with_mismatched_schemas_error(): + df1 = cudf.DataFrame( + { + "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), + "i64": cudf.Series([123, 3454, 123], dtype="int64"), + "i32": cudf.Series([123, 3454, 123], dtype="int32"), + } + ) + df2 = cudf.DataFrame( + { + "i64": cudf.Series([123, 3454, 123], dtype="int64"), + "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), + } + ) + + buf1 = BytesIO() + buf2 = BytesIO() + + df1.to_parquet(buf1, store_schema=True) + df2.to_parquet(buf2, store_schema=False) + + with pytest.raises( + ValueError, + match="Encountered mismatching SchemaElement properties for a column in the selected path", + ): + cudf.read_parquet( + [buf1, buf2], columns=["millis"], allow_mismatched_pq_schemas=True + ) + + data1 = [ + {"a": 1, "b": {"inner_a": 1, "inner_b": 6}}, + {"a": 3, "b": {"inner_a": None, "inner_b": 2}}, + ] + 
data2 = [ + {"b": {"inner_a": 1}, "c": "str"}, + {"b": {"inner_a": None}, "c": None}, + ] + + # cuDF tables from struct data + df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) + df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) + + # Buffers + buf1 = BytesIO() + buf2 = BytesIO() + + # Write to parquet + df1.to_parquet(buf1) + df2.to_parquet(buf2) + + with pytest.raises( + IndexError, + match="Encountered mismatching number of children for a column in the selected path", + ): + cudf.read_parquet( + [buf1, buf2], + columns=["struct.b"], + allow_mismatched_pq_schemas=True, + ) + + with pytest.raises( + IndexError, + match="Encountered mismatching schema tree depths across data sources", + ): + cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.inner_b"], + allow_mismatched_pq_schemas=True, + ) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 94974e595b1..6b146be0fa3 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -184,6 +184,9 @@ .. note: This option is not supported when the low-memory mode is on. +allow_mismatched_pq_schemas : boolean, default False + If True, enables reading (matching) columns specified in `columns` and `filters` + options from the input files with otherwise mismatched schemas. Returns ------- diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 47458b00159..9c476030ded 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -28,6 +28,7 @@ cpdef read_parquet( bool use_pandas_metadata = *, int64_t skip_rows = *, size_type nrows = *, + bool allow_mismatched_pq_schemas = *, # disabled see comment in parquet.pyx for more # ReaderColumnSchema reader_column_schema = *, # DataType timestamp_type = * diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index fb5244a2a9e..df1f1b14247 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -26,6 +26,7 @@ cdef parquet_reader_options _setup_parquet_reader_options( bool use_pandas_metadata = True, int64_t skip_rows = 0, size_type nrows = -1, + bool allow_mismatched_pq_schemas=False, # ReaderColumnSchema reader_column_schema = None, # DataType timestamp_type = DataType(type_id.EMPTY) ): @@ -34,6 +35,7 @@ cdef parquet_reader_options _setup_parquet_reader_options( parquet_reader_options.builder(source_info.c_obj) .convert_strings_to_categories(convert_strings_to_categories) .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) .use_arrow_schema(True) .build() ) @@ -80,6 +82,9 @@ cdef class ChunkedParquetReader: pass_read_limit : size_t, default 1024000000 Limit on the amount of memory used for reading and decompressing data or 0 if there is no limit. + allow_mismatched_pq_schemas : bool, default False + Whether to read (matching) columns specified in `columns` from + the input files with otherwise mismatched schemas. 
""" def __init__( self, @@ -91,7 +96,8 @@ cdef class ChunkedParquetReader: int64_t skip_rows = 0, size_type nrows = -1, size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000 + size_t pass_read_limit=1024000000, + bool allow_mismatched_pq_schemas=False ): cdef parquet_reader_options opts = _setup_parquet_reader_options( @@ -103,6 +109,7 @@ cdef class ChunkedParquetReader: use_pandas_metadata=use_pandas_metadata, skip_rows=skip_rows, nrows=nrows, + allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) with nogil: @@ -152,6 +159,7 @@ cpdef read_parquet( bool use_pandas_metadata = True, int64_t skip_rows = 0, size_type nrows = -1, + bool allow_mismatched_pq_schemas = False, # Disabled, these aren't used by cudf-python # we should only add them back in if there's user demand # ReaderColumnSchema reader_column_schema = None, @@ -179,6 +187,9 @@ cpdef read_parquet( The number of rows to skip from the start of the file. nrows : size_type, default -1 The number of rows to read. By default, read the entire file. + allow_mismatched_pq_schemas : bool, default False + If True, enable reading (matching) columns specified in `columns` + from the input files with otherwise mismatched schemas. Returns ------- @@ -195,6 +206,7 @@ cpdef read_parquet( use_pandas_metadata, skip_rows, nrows, + allow_mismatched_pq_schemas, ) with nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd index 222d87defa0..de6a6c1e82d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd @@ -32,7 +32,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: data_type get_timestamp_type() except + bool is_enabled_use_pandas_metadata() except + bool is_enabled_arrow_schema() except + - + bool is_enabled_allow_mismatched_pq_schemas() except + # setter void set_filter(expression &filter) except + @@ -41,6 +41,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_row_groups(vector[vector[size_type]] row_grp) except + void set_skip_rows(int64_t val) except + void enable_use_arrow_schema(bool val) except + + void enable_allow_mismatched_pq_schemas(bool val) except + void enable_use_pandas_metadata(bool val) except + void set_timestamp_type(data_type type) except + @@ -69,6 +70,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_reader_options_builder& use_arrow_schema( bool val ) except + + parquet_reader_options_builder& allow_mismatched_pq_schemas( + bool val + ) except + parquet_reader_options_builder& timestamp_type( data_type type ) except + From 9e9efcc9f5ed8411fb09f4d8384e14612a7f3b10 Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Thu, 29 Aug 2024 09:19:48 +1000 Subject: [PATCH 143/270] Replace raw device_memory_resource pointer in pylibcudf Cython (#16674) Replaces a single `device_memory_resource*` in pylibcudf Cython inline C++ function with `rmm::device_async_resource_ref` to help smooth RMM refactoring effort. 
Authors: - Mark Harris (https://github.com/harrism) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16674 --- python/pylibcudf/pylibcudf/libcudf/interop.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index c7efff2340d..9228c017d93 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -71,7 +71,7 @@ cdef extern from *: ArrowArray* to_arrow_host_raw( cudf::table_view const& tbl, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { // Assumes the sync event is null and the data is already on the host. ArrowArray *arr = new ArrowArray(); auto device_arr = cudf::to_arrow_host(tbl, stream, mr); From f6e2355dfefb1a02a984425aabeca7a4fcb2bfde Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 28 Aug 2024 19:03:34 -0500 Subject: [PATCH 144/270] Handle `ordered` parameter in `CategoricalIndex.__repr__` (#16683) Thanks @mroeschke for catching this in https://github.com/rapidsai/cudf/pull/16665#discussion_r1735277661 This PR factors in the `ordered` parameter while generating the `repr` for `CategoricalIndex`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16683 --- python/cudf/cudf/core/index.py | 1 + python/cudf/cudf/tests/test_repr.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 500fc580097..fc35ffa3744 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1456,6 +1456,7 @@ def __repr__(self): pd_preprocess.dtype._categories = ( preprocess.categories.to_pandas() ) + pd_preprocess.dtype._ordered = preprocess.dtype.ordered cats_repr = repr(pd_preprocess).split("\n") output = "\n".join(data_repr[:-1] + cats_repr[-1:]) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 57eef9e3463..681b467f66c 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1491,3 +1491,11 @@ def test_large_unique_categories_repr(): with utils.cudf_timeout(2, timeout_message="Failed to repr fast enough"): actual_repr = repr(gi) assert expected_repr == actual_repr + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_categorical_index_ordered(ordered): + pi = pd.CategoricalIndex(range(10), ordered=ordered) + gi = cudf.CategoricalIndex(range(10), ordered=ordered) + + assert repr(pi) == repr(gi) From f2d153b5e1d0c8410947afb438033468dc84d1b8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 29 Aug 2024 10:30:21 -1000 Subject: [PATCH 145/270] Have interval_range use IntervalIndex.from_breaks, remove column_empty_same_mask (#16694) To match pandas implementation, `interval_range` dispatches to `IntervalIndex.from_breaks` which allows some code deduplication. 
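A rough sketch of the equivalence this relies on (illustrative values, not taken from the patch):

```python
import cudf

# interval_range now builds its result through IntervalIndex.from_breaks,
# so these two constructions should produce the same index:
a = cudf.interval_range(start=0, end=4)              # breaks 0, 1, 2, 3, 4
b = cudf.IntervalIndex.from_breaks([0, 1, 2, 3, 4])  # same four intervals
```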
This also allows us to remove `column_empty_like_same_mask`, for which (luckily) I didn't find any usage across RAPIDS. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16694 --- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/column.py | 16 ------------- python/cudf/cudf/core/index.py | 30 +++--------------------- 3 files changed, 3 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 5781d77ee9a..06791df7dc0 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -11,7 +11,6 @@ build_column, column_empty, column_empty_like, - column_empty_like_same_mask, concat_columns, deserialize_columns, serialize_columns, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 885476a897c..7674565e2c3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1483,22 +1483,6 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: ) -def column_empty_like_same_mask( - column: ColumnBase, dtype: Dtype -) -> ColumnBase: - """Create a new empty Column with the same length and the same mask. - - Parameters - ---------- - dtype : np.dtype like - The dtype of the data buffer. - """ - result = column_empty_like(column, dtype) - if column.nullable: - result = result.set_mask(column.mask) - return result - - def column_empty( row_count: int, dtype: Dtype = "object", masked: bool = False ) -> ColumnBase: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index fc35ffa3744..241a276ebe2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3346,31 +3346,7 @@ def interval_range( init=start.device_value, step=freq.device_value, ) - left_col = bin_edges.slice(0, len(bin_edges) - 1) - right_col = bin_edges.slice(1, len(bin_edges)) - # For indexing, children should both have 0 offset - right_col = type(right_col)( - data=right_col.data, - dtype=right_col.dtype, - size=right_col.size, - mask=right_col.mask, - offset=0, - null_count=right_col.null_count, - children=right_col.children, - ) - - if len(right_col) == 0 or len(left_col) == 0: - dtype = IntervalDtype("int64", closed) - data = column.column_empty_like_same_mask(left_col, dtype) - return IntervalIndex(data, closed=closed, name=name) - - interval_col = IntervalColumn( - data=None, - dtype=IntervalDtype(left_col.dtype, closed), - size=len(left_col), - children=(left_col, right_col), - ) - return IntervalIndex(interval_col, closed=closed, name=name) + return IntervalIndex.from_breaks(bin_edges, closed=closed, name=name) class IntervalIndex(Index): @@ -3520,7 +3496,7 @@ def from_breaks( left_col = breaks.slice(0, len(breaks) - 1) right_col = breaks.slice(1, len(breaks)) # For indexing, children should both have 0 offset - right_col = column.build_column( + right_col = type(right_col)( data=right_col.data, dtype=right_col.dtype, size=right_col.size, @@ -3536,7 +3512,7 @@ def from_breaks( size=len(left_col), children=(left_col, right_col), ) - return IntervalIndex(interval_col, name=name, closed=closed) + return IntervalIndex._from_column(interval_col, name=name) @classmethod def from_arrays( From eca5108d2f3120c83b26ba5e3c9a6cfaa2b0b233 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:37:15
-0400 Subject: [PATCH 146/270] Disable gtests/ERROR_TEST during compute-sanitizer memcheck test (#16691) Disables the `gtests/ERROR_TEST` when run under `compute-sanitizer` for memcheck. The `compute-sanitizer` started hanging on some of these tests. There is no value in running memcheck on any of the tests in `ERROR_TEST`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/16691 --- cpp/tests/error/error_handling_test.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 1dfe45556c4..9c7459fa69d 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -50,8 +50,6 @@ CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } // calls. TEST(StreamCheck, FailedKernel) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); } - rmm::cuda_stream stream; int a; test_kernel<<<0, 0, 0, stream.value()>>>(&a); @@ -63,8 +61,6 @@ TEST(StreamCheck, FailedKernel) TEST(StreamCheck, CatchFailedKernel) { - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); } - rmm::cuda_stream stream; int a; test_kernel<<<0, 0, 0, stream.value()>>>(&a); @@ -131,6 +127,8 @@ TEST(DebugAssert, cudf_assert_true) // 2.) The RMM Pool interferes with the death test int main(int argc, char** argv) { + if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return 0; } + ::testing::InitGoogleTest(&argc, argv); auto const cmd_opts = parse_cudf_test_opts(argc, argv); auto adaptor = make_stream_mode_adaptor(cmd_opts); From 21d05d73a66c0bc0009ff378beb58fb4f0f2bf2d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:40:14 -0400 Subject: [PATCH 147/270] Move apply_boolean_mask benchmark to nvbench (#16616) Reworks the `apply_boolean_mask` benchmark as an nvbench benchmark under the `STREAM_COMPACTION_NVBENCH` module.
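For context, the operation under test corresponds to ordinary boolean masking at the Python layer (a sketch for orientation only, not part of this patch):

```python
import cudf

df = cudf.DataFrame({"a": range(10)})
mask = cudf.Series([i % 2 == 0 for i in range(10)])
filtered = df[mask]  # boolean masking is backed by cudf::apply_boolean_mask
```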
`cudf::string_view` was added as a type to help measure the performance improvement in a follow on PR for `apply_boolean_mask` for strings Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16616 --- cpp/benchmarks/CMakeLists.txt | 5 +- .../stream_compaction/apply_boolean_mask.cpp | 138 ++++++------------ 2 files changed, 48 insertions(+), 95 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7f3edfa0a01..99ef9e2976f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -151,14 +151,11 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp) # * transpose benchmark --------------------------------------------------------------------------- ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) -# ################################################################################################## -# * apply_boolean_mask benchmark ------------------------------------------------------------------ -ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp) - # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( STREAM_COMPACTION_NVBENCH + stream_compaction/apply_boolean_mask.cpp stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp stream_compaction/stable_distinct.cpp diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index 492237474ff..fa017ca9e29 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -15,120 +15,76 @@ */ #include +#include #include +#include -#include -#include +#include namespace { -constexpr cudf::size_type hundredM = 1e8; -constexpr cudf::size_type tenM = 1e7; -constexpr cudf::size_type tenK = 1e4; -constexpr cudf::size_type fifty_percent = 50; - -void percent_range(benchmark::internal::Benchmark* b) -{ - b->Unit(benchmark::kMillisecond); - for (int percent = 0; percent <= 100; percent += 10) - b->Args({hundredM, percent}); -} - -void size_range(benchmark::internal::Benchmark* b) -{ - b->Unit(benchmark::kMillisecond); - for (int size = tenK; size <= hundredM; size *= 10) - b->Args({size, fifty_percent}); -} - template -void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) +void calculate_bandwidth(nvbench::state& state) { - cudf::size_type const column_size{static_cast(state.range(0))}; - cudf::size_type const percent_true{static_cast(state.range(1))}; - - float const fraction = percent_true / 100.f; - cudf::size_type const column_size_out = fraction * column_size; - int64_t const mask_size = - sizeof(bool) * column_size + cudf::bitmask_allocation_size_bytes(column_size); - int64_t const validity_bytes_in = (fraction >= 1.0f / 32) - ? 
cudf::bitmask_allocation_size_bytes(column_size) - : 4 * column_size_out; - int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(column_size_out); - int64_t const column_bytes_out = sizeof(T) * column_size_out; + auto const n_rows = static_cast(state.get_int64("rows")); + auto const n_cols = static_cast(state.get_int64("columns")); + auto const percent_true = static_cast(state.get_int64("hits_%")); + + double const fraction = percent_true / 100.0; + cudf::size_type const output_size = fraction * n_rows; + int64_t const mask_size = sizeof(bool) * n_rows + cudf::bitmask_allocation_size_bytes(n_rows); + int64_t const validity_bytes_in = + (fraction >= 1.0 / 32) ? cudf::bitmask_allocation_size_bytes(n_rows) : 4 * output_size; + int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(output_size); + int64_t const column_bytes_out = sizeof(T) * output_size; int64_t const column_bytes_in = column_bytes_out; // we only read unmasked inputs - int64_t const bytes_read = - (column_bytes_in + validity_bytes_in) * num_columns + // reading columns - mask_size; // reading boolean mask + int64_t const bytes_read = (column_bytes_in + validity_bytes_in) * n_cols + // reading columns + mask_size; // reading boolean mask int64_t const bytes_written = - (column_bytes_out + validity_bytes_out) * num_columns; // writing columns + (column_bytes_out + validity_bytes_out) * n_cols; // writing columns - state.SetItemsProcessed(state.iterations() * column_size * num_columns); - state.SetBytesProcessed(static_cast(state.iterations()) * (bytes_read + bytes_written)); + state.add_element_count(n_rows * n_cols); + state.add_global_memory_reads(bytes_read); + state.add_global_memory_writes(bytes_written); } } // namespace -template -void BM_apply_boolean_mask(benchmark::State& state, cudf::size_type num_columns) +template +void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list) { - cudf::size_type const column_size{static_cast(state.range(0))}; - cudf::size_type const percent_true{static_cast(state.range(1))}; + auto const n_rows = static_cast(state.get_int64("rows")); + auto const n_cols = static_cast(state.get_int64("columns")); + auto const percent_true = static_cast(state.get_int64("hits_%")); - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto const input_type = cudf::type_to_id(); + data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( + input_type, distribution_id::UNIFORM, 0, 20); - auto source_table = create_random_table( - cycle_dtypes({cudf::type_to_id()}, num_columns), row_count{column_size}, profile); + auto source_table = + create_random_table(cycle_dtypes({input_type}, n_cols), row_count{n_rows}, profile); profile.set_bool_probability_true(percent_true / 100.0); profile.set_null_probability(std::nullopt); // no null mask - auto mask = create_random_column(cudf::type_id::BOOL8, row_count{column_size}, profile); + auto mask = create_random_column(cudf::type_id::BOOL8, row_count{n_rows}, profile); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + calculate_bandwidth(state); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = cudf::apply_boolean_mask(*source_table, mask->view()); - } + state.exec(nvbench::exec_tag::sync, [&source_table, &mask](nvbench::launch& launch) { + cudf::apply_boolean_mask(*source_table, 
mask->view()); + }); - calculate_bandwidth(state, num_columns); + set_throughputs(state); } -template -class ApplyBooleanMask : public cudf::benchmark { - public: - using TypeParam = T; -}; - -#define ABM_BENCHMARK_DEFINE(name, type, n_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(ApplyBooleanMask, name, type)(::benchmark::State & st) \ - { \ - BM_apply_boolean_mask(st, n_columns); \ - } - -ABM_BENCHMARK_DEFINE(float_1_col, float, 1); -ABM_BENCHMARK_DEFINE(float_2_col, float, 2); -ABM_BENCHMARK_DEFINE(float_4_col, float, 4); - -// shmoo 1, 2, 4 column float across percentage true -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_1_col)->Apply(percent_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_2_col)->Apply(percent_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_4_col)->Apply(percent_range); - -// shmoo 1, 2, 4 column float across column sizes with 50% true -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_1_col)->Apply(size_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_2_col)->Apply(size_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_4_col)->Apply(size_range); - -// spot benchmark other types -ABM_BENCHMARK_DEFINE(int8_1_col, int8_t, 1); -ABM_BENCHMARK_DEFINE(int16_1_col, int16_t, 1); -ABM_BENCHMARK_DEFINE(int32_1_col, int32_t, 1); -ABM_BENCHMARK_DEFINE(int64_1_col, int64_t, 1); -ABM_BENCHMARK_DEFINE(double_1_col, double, 1); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int8_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int16_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int32_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int64_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, double_1_col)->Args({tenM, fifty_percent}); +using data_type = nvbench::type_list; +NVBENCH_BENCH_TYPES(apply_boolean_mask_benchmark, NVBENCH_TYPE_AXES(data_type)) + .set_name("apply_boolean_mask") + .set_type_axes_names({"type"}) + .add_int64_axis("columns", {1, 4}) + .add_int64_axis("rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("hits_%", {10, 50, 100}); From 8c7af08073fba49c7a7e62cc30595b2962ae7e65 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 29 Aug 2024 18:18:05 -0500 Subject: [PATCH 148/270] Increase timeouts for couple of tests (#16692) This PR increases timeouts for tests. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16692 --- python/cudf/cudf/testing/_utils.py | 25 ++++++++++--------- python/cudf/cudf/tests/test_repr.py | 2 +- .../cudf_pandas_tests/test_cudf_pandas.py | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 540f12c8382..8cb9efa873c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. import itertools -import signal import string +import time from collections import abc from contextlib import contextmanager from decimal import Decimal @@ -376,16 +376,17 @@ class cudf_timeout: Context manager to raise a TimeoutError after a specified number of seconds. 
""" - def __init__(self, seconds, *, timeout_message=""): - self.seconds = int(seconds) - self.timeout_message = timeout_message - - def _timeout_handler(self, signum, frame): - raise TimeoutError(self.timeout_message) + def __init__(self, timeout): + self.timeout = timeout def __enter__(self): - signal.signal(signal.SIGALRM, self._timeout_handler) - signal.alarm(self.seconds) - - def __exit__(self, type, value, traceback): - signal.alarm(0) + self.start_time = time.perf_counter() + + def __exit__(self, *args): + elapsed_time = ( + time.perf_counter() - self.start_time + ) # Calculate elapsed time + if elapsed_time >= self.timeout: + raise TimeoutError( + f"Expected to finish in {self.timeout=} seconds but took {elapsed_time=} seconds" + ) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 681b467f66c..95e19fae501 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1488,7 +1488,7 @@ def test_large_unique_categories_repr(): pi = pd.CategoricalIndex(range(100_000_000)) gi = cudf.CategoricalIndex(range(100_000_000)) expected_repr = repr(pi) - with utils.cudf_timeout(2, timeout_message="Failed to repr fast enough"): + with utils.cudf_timeout(6): actual_repr = repr(gi) assert expected_repr == actual_repr diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 0827602852d..505d5d0b9cc 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1664,7 +1664,7 @@ def test_notebook_slow_repr(): nb = nbformat.read(f, as_version=4) ep = ExecutePreprocessor( - timeout=20, kernel_name=jupyter_client.KernelManager().kernel_name + timeout=30, kernel_name=jupyter_client.KernelManager().kernel_name ) try: From 53f488ba2db10bead273b1e5eff5f1a07703a7ae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 29 Aug 2024 14:09:14 -1000 Subject: [PATCH 149/270] Add type annotations to Index classes, utilize _from_column more (#16695) * Add more type annotations to `index.py` * More consistently use `Index._from_column` where appropriate * Remove single used `Index._indices_of` in favor of just accessing the `Column._indicies_of` method Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16695 --- python/cudf/cudf/core/index.py | 165 +++++++++++++++----------------- python/cudf/cudf/core/series.py | 5 +- 2 files changed, 83 insertions(+), 87 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 241a276ebe2..66d03682de4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -46,7 +46,6 @@ from cudf.core.column.column import as_column, concat_columns from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype -from cudf.core.frame import Frame from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import BinaryOperand from cudf.core.single_column_frame import SingleColumnFrame @@ -63,6 +62,8 @@ from collections.abc import Generator, Iterable from datetime import tzinfo + from cudf.core.frame import Frame + def ensure_index(index_like: Any) -> BaseIndex: """ @@ -316,7 +317,7 @@ def _num_rows(self) -> int: @cached_property # type: ignore @_performance_tracking - def _values(self): + def _values(self) -> 
ColumnBase: if len(self) > 0: return column.as_column(self._range, dtype=self.dtype) else: @@ -582,7 +583,7 @@ def __rmul__(self, other): return self.__mul__(other) @_performance_tracking - def _as_int_index(self): + def _as_int_index(self) -> Index: # Convert self to an integer index. This method is used to perform ops # that are not defined directly on RangeIndex. return cudf.Index._from_data(self._data) @@ -870,12 +871,12 @@ def join( @property # type: ignore @_performance_tracking - def _column(self): + def _column(self) -> ColumnBase: return self._as_int_index()._column @property # type: ignore @_performance_tracking - def _columns(self): + def _columns(self) -> list[ColumnBase]: return self._as_int_index()._columns @property # type: ignore @@ -937,7 +938,7 @@ def notna(self) -> cupy.ndarray: notnull = isna @_performance_tracking - def _minmax(self, meth: str): + def _minmax(self, meth: str) -> int | float: no_steps = len(self) - 1 if no_steps == -1: return np.nan @@ -948,10 +949,10 @@ def _minmax(self, meth: str): return self.start + self.step * no_steps - def min(self): + def min(self) -> int | float: return self._minmax("min") - def max(self): + def max(self) -> int | float: return self._minmax("max") @property @@ -1115,7 +1116,7 @@ def _from_data_like_self( @classmethod @_performance_tracking - def from_arrow(cls, obj): + def from_arrow(cls, obj) -> Index | cudf.MultiIndex: """Create from PyArrow Array/ChunkedArray. Parameters @@ -1145,11 +1146,11 @@ def from_arrow(cls, obj): return cudf.MultiIndex.from_arrow(obj) @cached_property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: return super().is_monotonic_increasing @cached_property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: return super().is_monotonic_decreasing def _binaryop( @@ -1191,7 +1192,7 @@ def _binaryop( @property # type: ignore @_performance_tracking - def _values(self): + def _values(self) -> ColumnBase: return self._column @classmethod @@ -1239,12 +1240,12 @@ def _concat(cls, objs): return result @_performance_tracking - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: return self._column.memory_usage @cached_property # type: ignore @_performance_tracking - def is_unique(self): + def is_unique(self) -> bool: return self._column.is_unique @_performance_tracking @@ -1271,7 +1272,7 @@ def equals(self, other) -> bool: return False @_performance_tracking - def copy(self, name=None, deep=False): + def copy(self, name: Hashable = None, deep: bool = False) -> Self: """ Make a copy of this object. @@ -1288,13 +1289,11 @@ def copy(self, name=None, deep=False): New index instance. 
""" name = self.name if name is None else name - - return _index_from_data( - {name: self._values.copy(True) if deep else self._values} - ) + col = self._column.copy(deep=True) if deep else self._column + return type(self)._from_column(col, name=name) @_performance_tracking - def astype(self, dtype, copy: bool = True): + def astype(self, dtype, copy: bool = True) -> Index: return super().astype({self.name: dtype}, copy) @_performance_tracking @@ -1405,7 +1404,7 @@ def get_loc(self, key) -> int | slice | cupy.ndarray: return mask @_performance_tracking - def __repr__(self): + def __repr__(self) -> str: max_seq_items = pd.get_option("max_seq_items") or len(self) mr = 0 if 2 * max_seq_items < len(self): @@ -1501,8 +1500,8 @@ def __repr__(self): keywords.append( f"freq={self._freq._maybe_as_fast_pandas_offset().freqstr!r}" ) - keywords = ", ".join(keywords) - lines.append(f"{prior_to_dtype} {keywords})") + joined_keywords = ", ".join(keywords) + lines.append(f"{prior_to_dtype} {joined_keywords})") return "\n".join(lines) @_performance_tracking @@ -1518,47 +1517,47 @@ def dtype(self): """ `dtype` of the underlying values in Index. """ - return self._values.dtype + return self._column.dtype @_performance_tracking - def isna(self): + def isna(self) -> cupy.ndarray: return self._column.isnull().values isnull = isna @_performance_tracking - def notna(self): + def notna(self) -> cupy.ndarray: return self._column.notnull().values notnull = notna - def _is_numeric(self): + def _is_numeric(self) -> bool: return ( isinstance(self._values, cudf.core.column.NumericalColumn) and self.dtype.kind != "b" ) - def _is_boolean(self): + def _is_boolean(self) -> bool: return self.dtype.kind == "b" - def _is_integer(self): + def _is_integer(self) -> bool: return self.dtype.kind in "iu" - def _is_floating(self): + def _is_floating(self) -> bool: return self.dtype.kind == "f" - def _is_object(self): - return isinstance(self._values, cudf.core.column.StringColumn) + def _is_object(self) -> bool: + return isinstance(self._column, cudf.core.column.StringColumn) - def _is_categorical(self): + def _is_categorical(self) -> bool: return False - def _is_interval(self): + def _is_interval(self) -> bool: return False @property # type: ignore @_performance_tracking - def hasnans(self): + def hasnans(self) -> bool: return self._column.has_nulls(include_nan=True) @_performance_tracking @@ -1600,13 +1599,13 @@ def argsort( na_position=na_position, ) - def repeat(self, repeats, axis=None): - return self._from_columns_like_self( - Frame._repeat([*self._columns], repeats, axis), self._column_names - ) + def repeat(self, repeats, axis=None) -> Self: + result = super()._repeat([self._column], repeats, axis)[0] + result = result._with_type_metadata(self.dtype) + return type(self)._from_column(result, name=self.name) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False) -> Index: result_col = super().where(cond, other, inplace) return self._mimic_inplace( _index_from_data({self.name: result_col}), @@ -1614,14 +1613,14 @@ def where(self, cond, other=None, inplace=False): ) @property - def values(self): + def values(self) -> cupy.ndarray: return self._column.values - def __contains__(self, item): + def __contains__(self, item) -> bool: hash(item) - return item in self._values + return item in self._column - def _clean_nulls_from_index(self): + def _clean_nulls_from_index(self) -> Index: if self._values.has_nulls(): fill_value = ( str(cudf.NaT) @@ -1635,8 +1634,8 @@ def 
_clean_nulls_from_index(self): return self - def any(self): - return self._values.any() + def any(self) -> bool: + return self._column.any() def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False @@ -1691,11 +1690,9 @@ def unique(self, level: int | None = None) -> Self: raise IndexError( f"Too many levels: Index has only 1 level, not {level + 1}" ) - return cudf.core.index._index_from_data( - {self.name: self._values.unique()}, name=self.name - ) + return type(self)._from_column(self._column.unique(), name=self.name) - def isin(self, values, level=None): + def isin(self, values, level=None) -> cupy.ndarray: if level is not None and level > 0: raise IndexError( f"Too many levels: Index has only 1 level, not {level + 1}" @@ -1706,11 +1703,7 @@ def isin(self, values, level=None): f"to isin(), you passed a {type(values).__name__}" ) - return self._values.isin(values).values - - def _indices_of(self, value): - """Return indices of value in index""" - return self._column.indices_of(value) + return self._column.isin(values).values @copy_docstring(StringMethods) # type: ignore @property @@ -2130,7 +2123,7 @@ def day_of_week(self) -> Index: @property # type: ignore @_performance_tracking - def year(self): + def year(self) -> Index: """ The year of the datetime. @@ -2149,7 +2142,7 @@ def year(self): @property # type: ignore @_performance_tracking - def month(self): + def month(self) -> Index: """ The month as January=1, December=12. @@ -2168,7 +2161,7 @@ def month(self): @property # type: ignore @_performance_tracking - def day(self): + def day(self) -> Index: """ The day of the datetime. @@ -2187,7 +2180,7 @@ def day(self): @property # type: ignore @_performance_tracking - def hour(self): + def hour(self) -> Index: """ The hours of the datetime. @@ -2208,7 +2201,7 @@ def hour(self): @property # type: ignore @_performance_tracking - def minute(self): + def minute(self) -> Index: """ The minutes of the datetime. @@ -2229,7 +2222,7 @@ def minute(self): @property # type: ignore @_performance_tracking - def second(self): + def second(self) -> Index: """ The seconds of the datetime. @@ -2250,7 +2243,7 @@ def second(self): @property # type: ignore @_performance_tracking - def microsecond(self): + def microsecond(self) -> Index: """ The microseconds of the datetime. @@ -2281,7 +2274,7 @@ def microsecond(self): @property # type: ignore @_performance_tracking - def nanosecond(self): + def nanosecond(self) -> Index: """ The nanoseconds of the datetime. @@ -2303,7 +2296,7 @@ def nanosecond(self): @property # type: ignore @_performance_tracking - def weekday(self): + def weekday(self) -> Index: """ The day of the week with Monday=0, Sunday=6. @@ -2325,7 +2318,7 @@ def weekday(self): @property # type: ignore @_performance_tracking - def dayofweek(self): + def dayofweek(self) -> Index: """ The day of the week with Monday=0, Sunday=6. @@ -2347,7 +2340,7 @@ def dayofweek(self): @property # type: ignore @_performance_tracking - def dayofyear(self): + def dayofyear(self) -> Index: """ The day of the year, from 1-365 in non-leap years and from 1-366 in leap years. @@ -2370,7 +2363,7 @@ def dayofyear(self): @property # type: ignore @_performance_tracking - def day_of_year(self): + def day_of_year(self) -> Index: """ The day of the year, from 1-365 in non-leap years and from 1-366 in leap years. 
@@ -2412,7 +2405,7 @@ def is_leap_year(self) -> cupy.ndarray: @property # type: ignore @_performance_tracking - def quarter(self): + def quarter(self) -> Index: """ Integer indicator for which quarter of the year the date belongs in. @@ -2523,11 +2516,11 @@ def _get_dt_field(self, field: str) -> Index: ) return Index._from_column(out_column, name=self.name) - def _is_boolean(self): + def _is_boolean(self) -> bool: return False @_performance_tracking - def ceil(self, freq): + def ceil(self, freq: str) -> Self: """ Perform ceil operation on the data to the specified freq. @@ -2558,7 +2551,7 @@ def ceil(self, freq): return type(self)._from_column(self._column.ceil(freq), name=self.name) @_performance_tracking - def floor(self, freq): + def floor(self, freq: str) -> Self: """ Perform floor operation on the data to the specified freq. @@ -2591,7 +2584,7 @@ def floor(self, freq): ) @_performance_tracking - def round(self, freq): + def round(self, freq: str) -> Self: """ Perform round operation on the data to the specified freq. @@ -2635,7 +2628,7 @@ def tz_localize( tz: str | None, ambiguous: Literal["NaT"] = "NaT", nonexistent: Literal["NaT"] = "NaT", - ): + ) -> Self: """ Localize timezone-naive data to timezone-aware data. @@ -2682,7 +2675,7 @@ def tz_localize( result_col, name=self.name, freq=self._freq ) - def tz_convert(self, tz: str | None): + def tz_convert(self, tz: str | None) -> Self: """ Convert tz-aware datetimes from one time zone to another. @@ -2717,7 +2710,7 @@ def tz_convert(self, tz: str | None): result_col = self._column.tz_convert(tz) return DatetimeIndex._from_column(result_col, name=self.name) - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis=None) -> Self: res = super().repeat(repeats, axis=axis) res._freq = None return res @@ -2982,7 +2975,7 @@ def nanoseconds(self) -> cudf.Index: @property # type: ignore @_performance_tracking - def components(self): + def components(self) -> cudf.DataFrame: """ Return a dataframe of the components (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. @@ -3003,7 +2996,7 @@ def inferred_freq(self): """ raise NotImplementedError("inferred_freq is not yet supported") - def _is_boolean(self): + def _is_boolean(self) -> bool: return False @@ -3122,16 +3115,16 @@ def codes(self) -> cudf.Index: @property # type: ignore @_performance_tracking - def categories(self): + def categories(self) -> cudf.Index: """ The categories of this categorical. """ return self.dtype.categories - def _is_boolean(self): + def _is_boolean(self) -> bool: return False - def _is_categorical(self): + def _is_categorical(self) -> bool: return True def add_categories(self, new_categories) -> Self: @@ -3440,7 +3433,7 @@ def __init__( super().__init__(interval_col, name=name) @property - def closed(self): + def closed(self) -> Literal["left", "right", "neither", "both"]: return self.dtype.closed @classmethod @@ -3461,7 +3454,7 @@ def from_breaks( name=None, copy: bool = False, dtype=None, - ): + ) -> Self: """ Construct an IntervalIndex from an array of splits. 
@@ -3533,7 +3526,7 @@ def from_tuples( name=None, copy: bool = False, dtype=None, - ) -> IntervalIndex: + ) -> Self: piidx = pd.IntervalIndex.from_tuples( data, closed=closed, name=name, copy=copy, dtype=dtype ) @@ -3544,13 +3537,13 @@ def __getitem__(self, index): "Getting a scalar from an IntervalIndex is not yet supported" ) - def _is_interval(self): + def _is_interval(self) -> bool: return True - def _is_boolean(self): + def _is_boolean(self) -> bool: return False - def _clean_nulls_from_index(self): + def _clean_nulls_from_index(self) -> Self: return self @property diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a831a798772..837c6872258 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -377,7 +377,10 @@ def _loc_to_iloc(self, arg): warnings.warn(warn_msg, FutureWarning) return arg try: - indices = self._frame.index._indices_of(arg) + if isinstance(self._frame.index, RangeIndex): + indices = self._frame.index._indices_of(arg) + else: + indices = self._frame.index._column.indices_of(arg) if (n := len(indices)) == 0: raise KeyError("Label scalar is out of bounds") elif n == 1: From 8f2d68750f839326343db00debb5735fe14075d3 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:40:20 -0700 Subject: [PATCH 150/270] Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` (#16541) Part of #12261. This PR refactors the dictionary encoding in Parquet writers to migrate from `cuco::legacy::static_map` to `cuco::static_map` to build the dictionaries. ### Performance Results The changes result in +0.08% average speed improvement and +16.22% average memory footprint increase (stems from the adjusted sizes by `cuco::make_window_extent` due to [prime gap](https://en.wikipedia.org/wiki/Prime_gap)) across the benchmark cases extended from #16591 Currently, we do see a roughly 8% speed improvement in map insert and find kernels which is counteracted by the map init and map collect kernels as they have to process 16.22% more slots. With a cuco bump, the average speed improvement will increase from 0.08% to +3% and the memory footprint change will go back from 16.22% to +0%. 
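For reference, the core pattern this migration introduces is a block-scoped, non-owning `cuco::static_map_ref` built over each chunk's slice of a single bulk slot storage, opting into only the operators a given kernel needs. A condensed sketch, lifted from the kernels in this PR (`storage_ref_type` and the sentinels come from `parquet_gpu.cuh`; `equality_fn_type`, `hash_fn_type`, and `probing_scheme_type` are the per-type aliases the dispatch functors define):

```cpp
// Non-owning view over this chunk's slice of the shared bulk slot storage.
storage_ref_type const storage_ref{chunk->dict_map_size,
                                   map_storage.data() + chunk->dict_map_offset};
auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
                                         cuco::empty_value{VALUE_SENTINEL},
                                         equality_fn_type{data_col},
                                         probing_scheme_type{hash_fn_type{data_col}},
                                         cuco::thread_scope_block,
                                         storage_ref};
// Build pass: opt into `cuco::insert`; insert() returns true only for new keys.
auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
bool const is_unique = map_insert_ref.insert(cuco::pair{val_idx, val_idx});
// Lookup pass: opt into `cuco::find` to translate each value to its dictionary index.
auto const map_find_ref = hash_map_ref.with_operators(cuco::find);
auto const found_slot   = map_find_ref.find(val_idx);
```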
### Hardware used for benchmarking ``` `NVIDIA RTX 5880 Ada Generation` * SM Version: 890 (PTX Version: 860) * Number of SMs: 110 * SM Default Clock Rate: 18446744071874 MHz * Global Memory: 23879 MiB Free / 48632 MiB Total * Global Memory Bus Peak: 960 GB/sec (384-bit DDR @10001MHz) * Max Shared Memory: 100 KiB/SM, 48 KiB/Block * L2 Cache Size: 98304 KiB * Maximum Active Blocks: 24/SM * Maximum Active Threads: 1536/SM, 1024/Block * Available Registers: 65536/SM, 65536/Block * ECC Enabled: No ``` Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16541 --- cpp/src/io/parquet/chunk_dict.cu | 370 ++++++++++++++++------------- cpp/src/io/parquet/parquet_gpu.cuh | 73 +++++- cpp/src/io/parquet/parquet_gpu.hpp | 44 +--- cpp/src/io/parquet/writer_impl.cu | 42 ++-- 4 files changed, 295 insertions(+), 234 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index a43c6d4cbb6..17ccb73c0a8 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -22,6 +22,7 @@ #include +#include #include namespace cudf::io::parquet::detail { @@ -30,28 +31,14 @@ namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } -template -CUDF_KERNEL void __launch_bounds__(block_size) - initialize_chunk_hash_maps_kernel(device_span chunks) -{ - auto const chunk = chunks[blockIdx.x]; - auto const t = threadIdx.x; - // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk - for (thread_index_type i = 0; i < chunk.dict_map_size; i += block_size) { - if (t + i < chunk.dict_map_size) { - new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; - new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; - } - } -} - template struct equality_functor { column_device_view const& col; - __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) + __device__ bool operator()(key_type lhs_idx, key_type rhs_idx) const { - // We don't call this for nulls so this is fine - auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; + // We don't call this for nulls so this is fine. 
+ auto constexpr equal = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; return equal(col.element(lhs_idx), col.element(rhs_idx)); } }; @@ -59,38 +46,167 @@ struct hash_functor { column_device_view const& col; - __device__ auto operator()(size_type idx) const + uint32_t const seed = 0; + __device__ auto operator()(key_type idx) const { - return cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); + return cudf::hashing::detail::MurmurHash3_x86_32{seed}(col.element(idx)); } }; +template struct map_insert_fn { - map_type::device_mutable_view& map; + storage_ref_type const& storage_ref; + EncColumnChunk* const& chunk; template - __device__ bool operator()(column_device_view const& col, size_type i) + __device__ void operator()(size_type const s_start_value_idx, size_type const end_value_idx) { if constexpr (column_device_view::has_element_accessor()) { - auto hash_fn = hash_functor{col}; - auto equality_fn = equality_functor{col}; - return map.insert(std::pair(i, i), hash_fn, equality_fn); + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; + + auto const col = chunk->col_desc; + column_device_view const& data_col = *col->leaf_column; + __shared__ size_type total_num_dict_entries; + + using equality_fn_type = equality_functor; + using hash_fn_type = hash_functor; + // Choosing `linear_probing` over `double_hashing` for slightly better performance seen in + // benchmarks. + using probing_scheme_type = cuco::linear_probing; + + // Make a view of the hash map. + auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn_type{data_col}, + probing_scheme_type{hash_fn_type{data_col}}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::insert` operator + auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + auto const t = threadIdx.x; + + // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size + cuda::atomic_ref const chunk_num_dict_entries{chunk->num_dict_entries}; + cuda::atomic_ref const chunk_uniq_data_size{chunk->uniq_data_size}; + + // Note: Adjust the following loop to use `cg::tile` if needed in the future. + for (thread_index_type val_idx = s_start_value_idx + t; val_idx - t < end_value_idx; + val_idx += block_size) { + size_type is_unique = 0; + size_type uniq_elem_size = 0; + + // Check if this index is valid. + auto const is_valid = + val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); + + // Insert val_idx into the hash map and count successful insertions. + if (is_valid) { + // Insert the keys using a single thread for best performance for now. 
+ is_unique = map_insert_ref.insert(cuco::pair{val_idx, val_idx}); + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: { + auto const col_type = data_col.type().id(); + if (col_type == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } else if (col_type == type_id::LIST) { + // Binary is stored as 4 byte length + bytes + return 4 + + get_element(data_col, val_idx).size_bytes(); + } + CUDF_UNREACHABLE( + "Byte array only supports string and list column types for dictionary " + "encoding!"); + } + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + CUDF_UNREACHABLE( + "Fixed length byte array only supports decimal 128 column types for dictionary " + "encoding!"); + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); + } + }(); + } + // Reduce num_unique and uniq_data_size across all threads in the block. + auto num_unique = block_reduce(reduce_storage).Sum(is_unique); + __syncthreads(); + auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); + // The first thread in the block atomically updates total num_unique and uniq_data_size + if (t == 0) { + total_num_dict_entries = + chunk_num_dict_entries.fetch_add(num_unique, cuda::std::memory_order_relaxed); + total_num_dict_entries += num_unique; + chunk_uniq_data_size.fetch_add(uniq_data_size, cuda::std::memory_order_relaxed); + } + __syncthreads(); + + // Check if the num unique values in chunk has already exceeded max dict size and early exit + if (total_num_dict_entries > MAX_DICT_SIZE) { return; } + } // for loop } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } } }; +template struct map_find_fn { - map_type::device_view& map; - + storage_ref_type const& storage_ref; + EncColumnChunk* const& chunk; template - __device__ map_type::device_view::iterator operator()(column_device_view const& col, size_type i) + __device__ void operator()(size_type const s_start_value_idx, + size_type const end_value_idx, + size_type const s_ck_start_val_idx) { if constexpr (column_device_view::has_element_accessor()) { - auto hash_fn = hash_functor{col}; - auto equality_fn = equality_functor{col}; - return map.find(i, hash_fn, equality_fn); + auto const col = chunk->col_desc; + column_device_view const& data_col = *col->leaf_column; + + using equality_fn_type = equality_functor; + using hash_fn_type = hash_functor; + // Choosing `linear_probing` over `double_hashing` for slightly better performance seen in + // benchmarks. + using probing_scheme_type = cuco::linear_probing; + + // Make a view of the hash map. + auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn_type{data_col}, + probing_scheme_type{hash_fn_type{data_col}}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::find` operator + auto const map_find_ref = hash_map_ref.with_operators(cuco::find); + auto const t = threadIdx.x; + + // Note: Adjust the following loop to use `cg::tiles` if needed in the future. + for (thread_index_type val_idx = s_start_value_idx + t; val_idx < end_value_idx; + val_idx += block_size) { + // Find the key using a single thread for best performance for now. 
+ if (data_col.is_valid(val_idx)) { + // No need for atomic as this is not going to be modified by any other thread. + chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() { + auto const found_slot = map_find_ref.find(val_idx); + + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); + + // Return the found value. + return found_slot->second; + }(); + } + } } else { CUDF_UNREACHABLE("Unsupported type to find in map"); } @@ -99,124 +215,61 @@ struct map_find_fn { template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(device_span const map_storage, + cudf::detail::device_2dspan frags) { - auto col_idx = blockIdx.y; - auto block_x = blockIdx.x; - auto t = threadIdx.x; - auto frag = frags[col_idx][block_x]; - auto chunk = frag.chunk; - auto col = chunk->col_desc; + auto const col_idx = blockIdx.y; + auto const block_x = blockIdx.x; + auto const frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; + auto col = chunk->col_desc; if (not chunk->use_dictionary) { return; } - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage reduce_storage; - size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; - // Find the bounds of values in leaf column to be inserted into the map for current chunk + // Find the bounds of values in leaf column to be inserted into the map for current chunk. size_type const s_start_value_idx = row_to_value_idx(start_row, *col); size_type const end_value_idx = row_to_value_idx(end_row, *col); column_device_view const& data_col = *col->leaf_column; - - // Make a view of the hash map - auto hash_map_mutable = map_type::device_mutable_view(chunk->dict_map_slots, - chunk->dict_map_size, - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - - __shared__ size_type total_num_dict_entries; - thread_index_type val_idx = s_start_value_idx + t; - while (val_idx - block_size < end_value_idx) { - auto const is_valid = - val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); - - // insert element at val_idx to hash map and count successful insertions - size_type is_unique = 0; - size_type uniq_elem_size = 0; - if (is_valid) { - is_unique = - type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: { - auto const col_type = data_col.type().id(); - if (col_type == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(val_idx).size_bytes(); - } else if (col_type == type_id::LIST) { - // Binary is stored as 4 byte length + bytes - return 4 + get_element(data_col, val_idx).size_bytes(); - } - CUDF_UNREACHABLE( - "Byte array only supports string and list column types for dictionary " - "encoding!"); - } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - CUDF_UNREACHABLE( - "Fixed length byte array only supports decimal 128 column types for dictionary " - "encoding!"); - default: CUDF_UNREACHABLE("Unsupported type for dictionary 
encoding"); - } - }(); - } - - auto num_unique = block_reduce(reduce_storage).Sum(is_unique); - __syncthreads(); - auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); - if (t == 0) { - total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); - total_num_dict_entries += num_unique; - atomicAdd(&chunk->uniq_data_size, uniq_data_size); - } - __syncthreads(); - - // Check if the num unique values in chunk has already exceeded max dict size and early exit - if (total_num_dict_entries > MAX_DICT_SIZE) { return; } - - val_idx += block_size; - } // while + storage_ref_type const storage_ref{chunk->dict_map_size, + map_storage.data() + chunk->dict_map_offset}; + type_dispatcher(data_col.type(), + map_insert_fn{storage_ref, chunk}, + s_start_value_idx, + end_value_idx); } template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(device_span chunks) + collect_map_entries_kernel(device_span const map_storage, + device_span chunks) { auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } - auto t = threadIdx.x; - auto map = map_type::device_view(chunk.dict_map_slots, - chunk.dict_map_size, - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - - __shared__ cuda::atomic counter; + auto t = threadIdx.x; + __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; - if (t == 0) { new (&counter) cuda::atomic{0}; } + if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); - for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { - if (t + i < chunk.dict_map_size) { - auto* slot = reinterpret_cast(map.begin_slot() + t + i); - auto key = slot->first; + + // Iterate over all windows in the map. + for (; t < chunk.dict_map_size; t += block_size) { + auto window = map_storage.data() + chunk.dict_map_offset + t; + // Collect all slots from each window. + for (auto& slot : *window) { + auto const key = slot.first; if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); + auto const loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); chunk.dict_data[loc] = key; - // If sorting dict page ever becomes a hard requirement, enable the following statement and - // add a dict sorting step before storing into the slot's second field. - // chunk.dict_data_idx[loc] = t + i; - slot->second = loc; + // If sorting dict page ever becomes a hard requirement, enable the following statement + // and add a dict sorting step before storing into the slot's second field. 
+ // chunk.dict_data_idx[loc] = idx; + slot.second = loc; } } } @@ -224,75 +277,60 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(device_span const map_storage, + cudf::detail::device_2dspan frags) { - auto col_idx = blockIdx.y; - auto block_x = blockIdx.x; - auto t = threadIdx.x; - auto frag = frags[col_idx][block_x]; - auto chunk = frag.chunk; - auto col = chunk->col_desc; + auto const col_idx = blockIdx.y; + auto const block_x = blockIdx.x; + auto const frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; if (not chunk->use_dictionary) { return; } size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; + auto const col = chunk->col_desc; // Find the bounds of values in leaf column to be searched in the map for current chunk auto const s_start_value_idx = row_to_value_idx(start_row, *col); auto const s_ck_start_val_idx = row_to_value_idx(chunk->start_row, *col); auto const end_value_idx = row_to_value_idx(end_row, *col); column_device_view const& data_col = *col->leaf_column; - - auto map = map_type::device_view(chunk->dict_map_slots, - chunk->dict_map_size, - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - - thread_index_type val_idx = s_start_value_idx + t; - while (val_idx < end_value_idx) { - if (data_col.is_valid(val_idx)) { - auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); - cudf_assert(found_slot != map.end() && - "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto* val_ptr = reinterpret_cast(&found_slot->second); - chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; - } - } - - val_idx += block_size; - } -} - -void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream) -{ - constexpr int block_size = 1024; - initialize_chunk_hash_maps_kernel - <<>>(chunks); + storage_ref_type const storage_ref{chunk->dict_map_size, + map_storage.data() + chunk->dict_map_offset}; + + type_dispatcher(data_col.type(), + map_find_fn{storage_ref, chunk}, + s_start_value_idx, + end_value_idx, + s_ck_start_val_idx); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(device_span const map_storage, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); populate_chunk_hash_maps_kernel - <<>>(frags); + <<>>(map_storage, frags); } -void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) +void collect_map_entries(device_span const map_storage, + device_span chunks, + rmm::cuda_stream_view stream) { constexpr int block_size = 1024; - collect_map_entries_kernel<<>>(chunks); + collect_map_entries_kernel + <<>>(map_storage, chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(device_span const map_storage, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel - <<>>(frags); + <<>>(map_storage, frags); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index e3c44c78898..7c09764da2d 100644 --- 
a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -18,25 +18,37 @@ #include "parquet_gpu.hpp" +#include #include #include -#include +#include +#include namespace cudf::io::parquet::detail { -auto constexpr KEY_SENTINEL = size_type{-1}; -auto constexpr VALUE_SENTINEL = size_type{-1}; +using key_type = size_type; +using mapped_type = size_type; +using slot_type = cuco::pair; -using map_type = cuco::legacy::static_map; +auto constexpr map_cg_size = + 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. + ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. +auto constexpr window_size = + 1; ///< Number of concurrent slots (set for best performance) handled by each thread. +auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size + ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor. -/** - * @brief The alias of `map_type::pair_atomic_type` class. - * - * Declare this struct by trivial subclassing instead of type aliasing so we can have forward - * declaration of this struct somewhere else. - */ -struct slot_type : public map_type::pair_atomic_type {}; +auto constexpr KEY_SENTINEL = key_type{-1}; +auto constexpr VALUE_SENTINEL = mapped_type{-1}; +auto constexpr SCOPE = cuda::thread_scope_block; + +using storage_type = cuco::aow_storage, + cudf::detail::cuco_allocator>; +using storage_ref_type = typename storage_type::ref_type; +using window_type = typename storage_type::window_type; /** * @brief Return the byte length of parquet dtypes that are physically represented by INT32 @@ -81,4 +93,43 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } +/** + * @brief Insert chunk values into their respective hash maps + * + * @param map_storage Bulk hashmap storage + * @param frags Column fragments + * @param stream CUDA stream to use + */ +void populate_chunk_hash_maps(device_span const map_storage, + cudf::detail::device_2dspan frags, + rmm::cuda_stream_view stream); + +/** + * @brief Compact dictionary hash map entries into chunk.dict_data + * + * @param map_storage Bulk hashmap storage + * @param chunks Flat span of chunks to compact hash maps for + * @param stream CUDA stream to use + */ +void collect_map_entries(device_span const map_storage, + device_span chunks, + rmm::cuda_stream_view stream); + +/** + * @brief Get the Dictionary Indices for each row + * + * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise + * stored in input column [row]. Stores these indices into chunk.dict_index. 
+ * + * Since dict_data itself contains indices into the original cudf column, this means that + * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] + * + * @param map_storage Bulk hashmap storage + * @param frags Column fragments + * @param stream CUDA stream to use + */ +void get_dictionary_indices(device_span const map_storage, + cudf::detail::device_2dspan frags, + rmm::cuda_stream_view stream); + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8f52f073833..125d35f6499 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -514,7 +514,6 @@ constexpr unsigned int kDictHashBits = 16; constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); struct EncPage; -struct slot_type; // convert Encoding to a mask value constexpr uint32_t encoding_to_mask(Encoding encoding) @@ -560,7 +559,8 @@ struct EncColumnChunk { uint8_t is_compressed; //!< Nonzero if the chunk uses compression uint32_t dictionary_size; //!< Size of dictionary page including header uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) - slot_type* dict_map_slots; //!< Hash map storage for calculating dict encoding for this chunk + uint32_t dict_map_offset; //!< Offset of the hash map storage for calculating dict encoding for + //!< this chunk size_type dict_map_size; //!< Size of dict_map_slots size_type num_dict_entries; //!< Total number of entries in dictionary size_type @@ -1001,46 +1001,6 @@ void InitFragmentStatistics(device_span groups, device_span fragments, rmm::cuda_stream_view stream); -/** - * @brief Initialize per-chunk hash maps used for dictionary with sentinel values - * - * @param chunks Flat span of chunks to initialize hash maps for - * @param stream CUDA stream to use - */ -void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream); - -/** - * @brief Insert chunk values into their respective hash maps - * - * @param frags Column fragments - * @param stream CUDA stream to use - */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, - rmm::cuda_stream_view stream); - -/** - * @brief Compact dictionary hash map entries into chunk.dict_data - * - * @param chunks Flat span of chunks to compact hash maps for - * @param stream CUDA stream to use - */ -void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream); - -/** - * @brief Get the Dictionary Indices for each row - * - * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise - * stored in input column [row]. Stores these indices into chunk.dict_index. 
- * - * Since dict_data itself contains indices into the original cudf column, this means that - * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] - * - * @param frags Column fragments - * @param stream CUDA stream to use - */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for initializing encoder data pages * diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 74992aa733f..46c3151c731 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1285,10 +1285,10 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, return std::pair(std::move(dict_data), std::move(dict_index)); } - // Allocate slots for each chunk - std::vector> hash_maps_storage; - hash_maps_storage.reserve(h_chunks.size()); - for (auto& chunk : h_chunks) { + // Variable to keep track of the current total map storage size + size_t total_map_storage_size = 0; + // Populate dict offsets and sizes for each chunk that need to build a dictionary. + std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) { auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; auto const is_requested_non_dict = chunk_col_desc.requested_encoding != column_encoding::USE_DEFAULT && @@ -1300,19 +1300,31 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.use_dictionary = false; } else { chunk.use_dictionary = true; - // cuCollections suggests using a hash map of size N * (1/0.7) = num_values * 1.43 - // https://github.com/NVIDIA/cuCollections/blob/3a49fc71/include/cuco/static_map.cuh#L190-L193 - auto& inserted_map = hash_maps_storage.emplace_back(chunk.num_values * 1.43, stream); - chunk.dict_map_slots = inserted_map.data(); - chunk.dict_map_size = inserted_map.size(); + chunk.dict_map_size = + static_cast(cuco::make_window_extent( + static_cast(occupancy_factor * chunk.num_values))); + chunk.dict_map_offset = total_map_storage_size; + total_map_storage_size += chunk.dict_map_size; } - } + }); - chunks.host_to_device_async(stream); + // No chunk needs to create a dictionary, exit early + if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } - initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - populate_chunk_hash_maps(frags, stream); + // Create a single bulk storage used by all sub-dictionaries + auto map_storage = storage_type{ + total_map_storage_size, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; + // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer. 
+ device_span const map_storage_data{map_storage.data(), total_map_storage_size}; + // Synchronize + chunks.host_to_device_async(stream); + // Initialize storage with the given sentinel + map_storage.initialize_async({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()}); + // Populate the hash map for each chunk + populate_chunk_hash_maps(map_storage_data, frags, stream); + // Synchronize again chunks.device_to_host_sync(stream); // Make decision about which chunks have dictionary @@ -1372,8 +1384,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - collect_map_entries(chunks.device_view().flat_view(), stream); - get_dictionary_indices(frags, stream); + collect_map_entries(map_storage_data, chunks.device_view().flat_view(), stream); + get_dictionary_indices(map_storage_data, frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } From f932bf9c62f73aabee2ac094180036399ce88dcf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 29 Aug 2024 15:28:37 -1000 Subject: [PATCH 151/270] Fix Series.to_frame(name=None) setting a None name (#16698) In pandas 2.0, `to_frame(name=None)` allowed the resulting column name to be `None` https://github.com/pandas-dev/pandas/pull/45523 Looks like based on the current default of `cudf.Series.to_frame`, this behavior was not reflected. Additionally, created a `SingleColumnFrame._to_frame` to more easily share the logic between `Series.to_frame` and `Index.to_frame` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16698 --- python/cudf/cudf/core/_base_index.py | 58 -------------------- python/cudf/cudf/core/index.py | 57 +++++++++++++++++++ python/cudf/cudf/core/series.py | 12 +--- python/cudf/cudf/core/single_column_frame.py | 11 ++++ python/cudf/cudf/tests/test_series.py | 7 +++ 5 files changed, 77 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a224e0ce0d0..ff114474aa4 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -798,64 +798,6 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def to_frame(self, index=True, name=no_default): - """Create a DataFrame with a column containing this Index - - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index - name : object, defaults to index.name - The passed name should substitute for the index name (if it has - one). - - Returns - ------- - DataFrame - DataFrame containing the original Index data. - - See Also - -------- - Index.to_series : Convert an Index to a Series. - Series.to_frame : Convert Series to DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') - >>> idx.to_frame() - animal - animal - Ant Ant - Bear Bear - Cow Cow - - By default, the original Index is reused. 
To enforce a new Index: - - >>> idx.to_frame(index=False) - animal - 0 Ant - 1 Bear - 2 Cow - - To override the name of the resulting column, specify `name`: - - >>> idx.to_frame(index=False, name='zoo') - zoo - 0 Ant - 1 Bear - 2 Cow - """ - - if name is no_default: - col_name = 0 if self.name is None else self.name - else: - col_name = name - - return cudf.DataFrame( - {col_name: self._values}, index=self if index else None - ) - def to_arrow(self): """Convert to a suitable Arrow object.""" raise NotImplementedError diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 66d03682de4..b2bd20c4982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -529,6 +529,11 @@ def to_pandas( name=self.name, ) + def to_frame( + self, index: bool = True, name: Hashable = no_default + ) -> cudf.DataFrame: + return self._as_int_index().to_frame(index=index, name=name) + @property def is_unique(self) -> bool: return True @@ -1646,6 +1651,58 @@ def to_pandas( result.name = self.name return result + def to_frame( + self, index: bool = True, name: Hashable = no_default + ) -> cudf.DataFrame: + """Create a DataFrame with a column containing this Index + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index + name : object, defaults to index.name + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. 
To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + return self._to_frame(name=name, index=self if index else None) + def append(self, other): if is_list_like(other): to_concat = [self] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 837c6872258..aadbd80f4b4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1160,7 +1160,7 @@ def reset_index( ) @_performance_tracking - def to_frame(self, name=None): + def to_frame(self, name: abc.Hashable = no_default) -> cudf.DataFrame: """Convert Series into a DataFrame Parameters @@ -1192,15 +1192,7 @@ def to_frame(self, name=None): 13 15 d """ # noqa: E501 - - if name is not None: - col = name - elif self.name is None: - col = 0 - else: - col = self.name - - return cudf.DataFrame({col: self._column}, index=self.index) + return self._to_frame(name=name, index=self.index) @_performance_tracking def memory_usage(self, index=True, deep=False): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 55dda34a576..0e66f383ca0 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -158,6 +158,17 @@ def to_arrow(self) -> pa.Array: """ return self._column.to_arrow() + def _to_frame( + self, name: Hashable, index: cudf.Index | None + ) -> cudf.DataFrame: + """Helper function for Series.to_frame, Index.to_frame""" + if name is no_default: + col_name = 0 if self.name is None else self.name + else: + col_name = name + ca = ColumnAccessor({col_name: self._column}, verify=False) + return cudf.DataFrame._from_data(ca, index=index) + @property # type: ignore @_performance_tracking def is_unique(self) -> bool: diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8d673e23ab2..a24002dc38e 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2557,6 +2557,13 @@ def test_series_arrow_list_types_roundtrip(): cudf.from_pandas(pdf) +@pytest.mark.parametrize("base_name", [None, "a"]) +def test_series_to_frame_none_name(base_name): + result = cudf.Series(range(1), name=base_name).to_frame(name=None) + expected = pd.Series(range(1), name=base_name).to_frame(name=None) + assert_eq(result, expected) + + @pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) @pytest.mark.parametrize( "data", [pa.array([float("nan")]), pa.chunked_array([[float("nan")]])] From 62a53b34f6c5c9145e908403d674cc6c16bab7f2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 29 Aug 2024 21:54:02 -0400 Subject: [PATCH 152/270] [FEA] Add third-party library integration testing of cudf.pandas to cudf (#16645) Closes #16580 Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16645 --- .github/workflows/test.yaml | 11 + ci/cudf_pandas_scripts/run_tests.sh | 3 + .../ci_run_library_tests.sh | 56 +++ .../third-party-integration/test.sh | 83 ++++ .../dependencies.yaml | 276 +++++++++++++ .../tests/conftest.py | 173 +++++++++ .../tests/pytest.ini | 7 + .../tests/test_cugraph.py | 94 +++++ .../tests/test_cuml.py | 
152 ++++++++ .../tests/test_dask.py | 10 + .../tests/test_featureengine.py | 47 +++ .../tests/test_holoviews.py | 79 ++++ .../tests/test_hvplot.py | 72 ++++ .../tests/test_ibis.py | 169 ++++++++ .../tests/test_matplotlib.py | 70 ++++ .../tests/test_numpy.py | 59 +++ .../tests/test_plotly.py | 67 ++++ .../tests/test_pytorch.py | 128 ++++++ .../tests/test_scipy.py | 65 ++++ .../tests/test_seaborn.py | 60 +++ .../tests/test_sklearn.py | 82 ++++ .../tests/test_stumpy.py | 94 +++++ .../tests/test_stumpy_distributed.py | 48 +++ .../tests/test_tensorflow.py | 367 ++++++++++++++++++ .../tests/test_xgboost.py | 135 +++++++ 25 files changed, 2407 insertions(+) create mode 100755 ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh create mode 100755 ci/cudf_pandas_scripts/third-party-integration/test.sh create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9feea050b19..2c68f2861bb 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -124,3 +124,14 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh + third-party-integration-tests-cudf-pandas: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + container_image: "rapidsai/ci-conda:latest" + run_script: 
| + ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 52964496b36..8b85695c861 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -64,7 +64,9 @@ fi python -m pip install ipykernel python -m ipykernel install --user --name python3 +# The third-party integration tests are ignored because they are run nightly in a separate CI job python -m pytest -p cudf.pandas \ + --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \ --cov-config=./python/cudf/.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ @@ -80,6 +82,7 @@ for version in "${versions[@]}"; do echo "Installing pandas version: ${version}" python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}" python -m pytest -p cudf.pandas \ + --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \ --cov-config=./python/cudf/.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ diff --git a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh new file mode 100755 index 00000000000..54a56508cdc --- /dev/null +++ b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +cleanup() { + rm ${TEST_DIR}/results-*.pickle +} + +trap cleanup EXIT + +runtest_gold() { + local lib=$1 + + pytest \ + -v \ + --continue-on-collection-errors \ + --cache-clear \ + --numprocesses=${NUM_PROCESSES} \ + --dist=worksteal \ + ${TEST_DIR}/test_${lib}*.py +} + +runtest_cudf_pandas() { + local lib=$1 + + pytest \ + -p cudf.pandas \ + -v \ + --continue-on-collection-errors \ + --cache-clear \ + --numprocesses=${NUM_PROCESSES} \ + --dist=worksteal \ + ${TEST_DIR}/test_${lib}*.py +} + +main() { + local lib=$1 + + # generation phase + runtest_gold ${lib} + runtest_cudf_pandas ${lib} + + # assertion phase + pytest \ + --compare \ + -p cudf.pandas \ + -v \ + --continue-on-collection-errors \ + --cache-clear \ + --numprocesses=${NUM_PROCESSES} \ + --dist=worksteal \ + ${TEST_DIR}/test_${lib}*.py +} + +main "$@" diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh new file mode 100755 index 00000000000..89b28c30e39 --- /dev/null +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ +# Common setup steps shared by Python test jobs + +set -euo pipefail + +write_output() { + local key="$1" + local value="$2" + echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}" +} + +extract_lib_from_dependencies_yaml() { + local file=$1 + # Parse all keys in dependencies.yaml under the "files" section, + # extract all the keys that start with "test_", and extract the rest + local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')" + echo $extracted_libs +} + +main() { + local dependencies_yaml="$1" + + LIBS=$(extract_lib_from_dependencies_yaml "$dependencies_yaml") + LIBS=${LIBS#[} + LIBS=${LIBS%]} + + for lib in ${LIBS//,/ }; do + lib=$(echo "$lib" | tr -d '""') + echo "Running tests for library $lib" + + CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + + . /opt/conda/etc/profile.d/conda.sh + + rapids-logger "Generate Python testing dependencies" + rapids-dependency-file-generator \ + --config "$dependencies_yaml" \ + --output conda \ + --file-key test_${lib} \ + --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + + rapids-mamba-retry env create --yes -f env.yaml -n test + + # Temporarily allow unbound variables for conda activation. + set +u + conda activate test + set -u + + repo_root=$(git rev-parse --show-toplevel) + TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests + + rapids-print-env + + rapids-logger "Check GPU usage" + nvidia-smi + + EXITCODE=0 + trap "EXITCODE=1" ERR + set +e + + rapids-logger "pytest ${lib}" + + NUM_PROCESSES=8 + serial_libraries=( + "tensorflow" + ) + for serial_library in "${serial_libraries[@]}"; do + if [ "${lib}" = "${serial_library}" ]; then + NUM_PROCESSES=1 + fi + done + + TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib} + + rapids-logger "Test script exiting with value: ${EXITCODE}" + done + + exit ${EXITCODE} +} + +main "$@" diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml new file mode 100644 index 00000000000..05e1d8178d5 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -0,0 +1,276 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# Dependency list for https://github.com/rapidsai/dependency-file-generator
+files:
+  checks:
+    output: none
+    includes:
+      - develop
+      - py_version
+  test_dask:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_dask
+  test_matplotlib:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_matplotlib
+  test_numpy:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_numpy
+  test_pytorch:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_pytorch
+  test_seaborn:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_seaborn
+  test_scipy:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_scipy
+  test_sklearn:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_sklearn
+  test_stumpy:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_stumpy
+  test_tensorflow:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_tensorflow
+  test_xgboost:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_xgboost
+  test_cuml:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_cuml
+  test_cugraph:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_cugraph
+  test_ibis:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_ibis
+  test_hvplot:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_hvplot
+  test_holoviews:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_holoviews
+  test_plotly:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_plotly
+
+channels:
+  - rapidsai-nightly
+  - rapidsai
+  - conda-forge
+  - nvidia
+
+dependencies:
+  develop:
+    common:
+      - output_types: conda
+        packages:
+          - pre-commit
+  cuda_version:
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              cuda: "11"
+            packages:
+              - cuda-version=11.8
+          - matrix:
+              cuda: "11.8"
+            packages:
+              - cuda-version=11.8
+          - matrix:
+              cuda: "12.0"
+            packages:
+              - cuda-version=12.0
+          - matrix:
+              cuda: "12.2"
+            packages:
+              - cuda-version=12.2
+          - matrix:
+              cuda: "12.5"
+            packages:
+              - cuda-version=12.5
+          - matrix:
+              cuda: "12"
+            packages:
+              - cuda-version=12.5
+  py_version:
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              py: "3.10"
+            packages:
+              - python=3.10
+          - matrix:
+              py: "3.11"
+            packages:
+              - python=3.11
+          - matrix:
+            packages:
+              - python>=3.10,<3.12
+  test_base:
+    common:
+      - output_types: conda
+        packages:
+          - cudf==24.10.*,>=0.0.0a0
+          - pandas
+          - pytest
+          - pytest-xdist
+  test_dask:
+    common:
+      - output_types: conda
+        packages:
+          - dask
+  test_matplotlib:
+    common:
+      - output_types: conda
+        packages:
+          - matplotlib-base
+  test_numpy:
+    common:
+      - output_types: conda
+        packages:
+          - numpy
+  test_pytorch:
+    common:
+      - output_types: conda
+        packages:
+          - numpy
+          - pytorch>=2.1.0
+  test_seaborn:
+    common:
+      - output_types: conda
+        packages:
+          - seaborn
+  test_scipy:
+    common:
+      - output_types: conda
+        packages:
+          - scipy
+  test_sklearn:
+    common:
+      - output_types: conda
+        packages:
+          - scikit-learn
+  test_stumpy:
+    common:
+      - output_types: conda
+        packages:
+          - dask
+          - stumpy
+  test_tensorflow:
+    common:
+      - output_types: conda
+        packages:
+          - tensorflow
+  test_xgboost:
+    common:
+      - output_types: conda
+        packages:
+          - hypothesis
+          - numpy
+          - scipy
+          - scikit-learn
+          - pip
+          - pip:
+              - xgboost>=2.0.1
+  test_cuml:
+    common:
+      - output_types: conda
+        packages:
+          - cuml==24.10.*,>=0.0.0a0
+          - scikit-learn
+  test_cugraph:
+    common:
+      - output_types: conda
+        packages:
+          - cugraph==24.10.*,>=0.0.0a0
+          - networkx
+  test_ibis:
+    common:
+      - output_types: conda
+        packages:
+          - pip
+          - pip:
+              - ibis-framework[pandas]
+  test_hvplot:
+    common:
+      - output_types: conda
+        packages:
+          - hvplot
+  test_holoviews:
+    common:
+      - output_types: conda
+        packages:
+          - holoviews
+  test_plotly:
+    common:
+      - output_types: conda
+        packages:
+          - plotly
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
new file mode 100644
index 00000000000..33b6ffdbd5c
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import os
+import pickle
+from typing import TYPE_CHECKING, BinaryIO
+
+import _pytest
+import _pytest.config
+import _pytest.nodes
+import pytest
+
+if TYPE_CHECKING:
+    import _pytest.python
+
+from _pytest.stash import StashKey
+
+from cudf.pandas.module_accelerator import disable_module_accelerator
+
+file_handle_key = StashKey[BinaryIO]()
+basename_key = StashKey[str]()
+test_folder_key = StashKey[str]()
+results = StashKey[tuple[dict, dict]]()
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--compare",
+        action="store_true",
+        default=False,
+        help="Run comparison step?",
+    )
+
+
+def read_results(f):
+    while True:
+        try:
+            yield pickle.load(f)
+        except EOFError:
+            return
+
+
+def pytest_collection_modifyitems(
+    session, config: _pytest.config.Config, items: list[_pytest.nodes.Item]
+):
+    if config.getoption("--compare"):
+        current_pass = "compare"
+    elif "cudf.pandas" in config.option.plugins:
+        current_pass = "cudf_pandas"
+    else:
+        current_pass = "gold"
+
+    def swap_xfail(item: _pytest.nodes.Item, name: str):
+        """Replace a custom `xfail_**` mark with an `xfail` mark having the same kwargs."""
+
+        old_mark = item.keywords[name]
+        new_mark = pytest.mark.xfail(**old_mark.kwargs)
+
+        # Replace all "xfail_**" marks in the node chain with the "xfail" mark;
+        # if none are found, the node chain is left unmodified.
+        for node, mark in item.iter_markers_with_node(name):
+            idx = node.own_markers.index(mark)
+            node.own_markers[idx] = new_mark
+
+    for item in items:
+        if current_pass == "gold" and "xfail_gold" in item.keywords:
+            swap_xfail(item, "xfail_gold")
+        elif (
+            current_pass == "cudf_pandas"
+            and "xfail_cudf_pandas" in item.keywords
+        ):
+            swap_xfail(item, "xfail_cudf_pandas")
+        elif current_pass == "compare" and "xfail_compare" in item.keywords:
+            swap_xfail(item, "xfail_compare")
+
+
+def pytest_configure(config: _pytest.config.Config):
+    gold_basename = "results-gold"
+    cudf_basename = "results-cudf-pandas"
+    test_folder = os.path.join(os.path.dirname(__file__))
+
+    if config.getoption("--compare"):
+        # Everyone reads everything
+        gold_path = os.path.join(test_folder, f"{gold_basename}.pickle")
+        cudf_path = os.path.join(test_folder, f"{cudf_basename}.pickle")
+        with disable_module_accelerator():
+            with open(gold_path, "rb") as f:
+                gold_results = dict(read_results(f))
+            with open(cudf_path, "rb") as f:
+                cudf_results = dict(read_results(f))
+        config.stash[results] = (gold_results, cudf_results)
+    else:
+        if "cudf.pandas" in config.option.plugins:
+            basename = cudf_basename
+        else:
+            basename = gold_basename
+
+        if hasattr(config, "workerinput"):
+            # If we're on an xdist worker, open a worker-unique pickle file.
+            worker = config.workerinput["workerid"]
+            filename = f"{basename}-{worker}.pickle"
+        else:
+            filename = f"{basename}.pickle"
+
+        pickle_path = os.path.join(test_folder, filename)
+        config.stash[file_handle_key] = open(pickle_path, "wb")
+        config.stash[test_folder_key] = test_folder
+        config.stash[basename_key] = basename
+
+
+def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function):
+    if pyfuncitem.config.getoption("--compare"):
+        gold_results, cudf_results = pyfuncitem.config.stash[results]
+        key = pyfuncitem.nodeid
+        try:
+            gold = gold_results[key]
+        except KeyError:
+            assert False, "pickled gold result is not available"
+        try:
+            cudf = cudf_results[key]
+        except KeyError:
+            assert False, "pickled cudf result is not available"
+        if gold is None and cudf is None:
+            raise ValueError(f"Integration test {key} did not return a value")
+        asserter = pyfuncitem.get_closest_marker("assert_eq")
+        if asserter is None:
+            assert gold == cudf, "Test failed"
+        else:
+            asserter.kwargs["fn"](gold, cudf)
+    else:
+        # Replace default call of test function with one that captures the
+        # result
+        testfunction = pyfuncitem.obj
+        funcargs = pyfuncitem.funcargs
+        testargs = {
+            arg: funcargs[arg] for arg in pyfuncitem._fixtureinfo.argnames
+        }
+        result = testfunction(**testargs)
+        # Tuple-based key-value pairs, key is the node-id
+        try:
+            pickle.dump(
+                (pyfuncitem.nodeid, result),
+                pyfuncitem.config.stash[file_handle_key],
+            )
+        except pickle.PicklingError:
+            pass
+    return True
+
+
+def pytest_unconfigure(config):
+    if config.getoption("--compare"):
+        return
+    if file_handle_key not in config.stash:
+        # We didn't open a pickle file
+        return
+    if not hasattr(config, "workerinput"):
+        # If we're the controlling process
+        if (
+            hasattr(config.option, "numprocesses")
+            and config.option.numprocesses is not None
+        ):
+            # Concat the worker partial pickle results and remove them
+            for i in range(config.option.numprocesses):
+                worker_result = os.path.join(
+                    config.stash[test_folder_key],
+                    f"{config.stash[basename_key]}-gw{i}.pickle",
+                )
+                with open(worker_result, "rb") as f:
+                    config.stash[file_handle_key].write(f.read())
+                os.remove(worker_result)
+    # Close our file
+    del config.stash[file_handle_key]
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
new file mode 100644
index 00000000000..817d98e6ba2
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
@@ -0,0 +1,7 @@
+[pytest]
+xfail_strict=true
+markers=
+    assert_eq: custom binary asserter for a test
+    xfail_gold: this test is expected to fail in the gold pass
+    xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass
+    xfail_compare: this test is expected to fail in the comparison pass
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py
new file mode 100644
index 00000000000..7acc8672063
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import cugraph
+import cupy as cp
+import networkx as nx
+import numpy as np
+import pandas as pd
+import pytest
+
+cugraph_algos = [
+    "betweenness_centrality",
+    "degree_centrality",
+    "katz_centrality",
+    "sorensen_coefficient",
+    "jaccard_coefficient",
+]
+
+nx_algos = [
+    "betweenness_centrality",
+    "degree_centrality",
+    "katz_centrality",
+]
+
+
+def assert_cugraph_equal(expect, got):
+    # Coerce GPU arrays to CPU before comparing
+    if isinstance(expect, cp.ndarray):
+        expect = expect.get()
+    if isinstance(got, cp.ndarray):
+        got = got.get()
+    if isinstance(expect, np.ndarray) and isinstance(got, np.ndarray):
+        assert np.array_equal(expect, got)
+    else:
+        assert expect == got
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_cugraph_equal)
+
+
+@pytest.fixture(scope="session")
+def df():
+    return pd.DataFrame({"source": [0, 1, 2], "destination": [1, 2, 3]})
+
+
+@pytest.fixture(scope="session")
+def adjacency_matrix():
+    data = {
+        "A": [0, 1, 1, 0],
+        "B": [1, 0, 0, 1],
+        "C": [1, 0, 0, 1],
+        "D": [0, 1, 1, 0],
+    }
+    df = pd.DataFrame(data, index=["A", "B", "C", "D"])
+    return df
+
+
+@pytest.mark.parametrize("algo", cugraph_algos)
+def test_cugraph_from_pandas_edgelist(df, algo):
+    G = cugraph.Graph()
+    G.from_pandas_edgelist(df)
+    return getattr(cugraph, algo)(G).to_pandas().values
+
+
+@pytest.mark.parametrize("algo", cugraph_algos)
+def test_cugraph_from_pandas_adjacency(adjacency_matrix, algo):
+    G = cugraph.Graph()
+    G.from_pandas_adjacency(adjacency_matrix)
+    res = getattr(cugraph, algo)(G).to_pandas()
+    return res.sort_values(list(res.columns)).values
+
+
+@pytest.mark.parametrize("algo", cugraph_algos)
+def test_cugraph_from_numpy_array(df, algo):
+    G = cugraph.Graph()
+    G.from_numpy_array(df.values)
+    return getattr(cugraph, algo)(G).to_pandas().values
+
+
+@pytest.mark.parametrize("algo", nx_algos)
+def test_networkx_from_pandas_edgelist(df, algo):
+    G = nx.from_pandas_edgelist(
+        df, "source", "destination", ["source", "destination"]
+    )
+    return getattr(nx, algo)(G)
+
+
+@pytest.mark.parametrize("algo", nx_algos)
+def test_networkx_from_pandas_adjacency(adjacency_matrix, algo):
+    G = nx.from_pandas_adjacency(adjacency_matrix)
+    return getattr(nx, algo)(G)
+
+
+@pytest.mark.parametrize("algo", nx_algos)
+def test_networkx_from_numpy_array(adjacency_matrix, algo):
+    G = nx.from_numpy_array(adjacency_matrix.values)
+    return getattr(nx, algo)(G)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py
new file mode 100644
index 00000000000..892d0886596
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import cupy as cp
+import numpy as np
+import pandas as pd
+import pytest
+from cuml.cluster import KMeans
+from cuml.decomposition import PCA
+from cuml.ensemble import RandomForestClassifier
+from cuml.linear_model import LinearRegression, LogisticRegression
+from cuml.metrics import accuracy_score
+from cuml.model_selection import train_test_split
+from cuml.pipeline import Pipeline
+from cuml.preprocessing import StandardScaler
+
+
+def assert_cuml_equal(expect, got):
+    # Coerce GPU arrays to CPU
+    if isinstance(expect, cp.ndarray):
+        expect = expect.get()
+    if isinstance(got, cp.ndarray):
+        got = got.get()
+
+    # Handle equality
+    if isinstance(expect, KMeans) and isinstance(got, KMeans):
+        # same clusters
+        np.testing.assert_allclose(
+            expect.cluster_centers_, got.cluster_centers_
+        )
+    elif isinstance(expect, np.ndarray) and isinstance(got, np.ndarray):
+        np.testing.assert_allclose(expect, got)
+    elif isinstance(expect, tuple) and isinstance(got, tuple):
+        assert len(expect) == len(got)
+        for e, g in zip(expect, got):
+            assert_cuml_equal(e, g)
+    elif isinstance(expect, pd.DataFrame):
+        pd.testing.assert_frame_equal(expect, got)
+    elif isinstance(expect, pd.Series):
+        pd.testing.assert_series_equal(expect, got)
+    else:
+        assert expect == got
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_cuml_equal)
+
+
+@pytest.fixture
+def binary_classification_data():
+    data = {
+        "feature1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        "feature2": [2.0, 4.0, 1.0, 3.0, 5.0, 7.0, 6.0, 8.0, 10.0, 9.0],
+        "target": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+    }
+    df = pd.DataFrame(data)
+    return df
+
+
+def test_linear_regression():
+    lr = LinearRegression(fit_intercept=True, normalize=False, algorithm="eig")
+    X = pd.DataFrame()
+    X["col1"] = np.array([1, 1, 2, 2], dtype=np.float32)
+    X["col2"] = np.array([1, 2, 2, 3], dtype=np.float32)
+    y = pd.Series(np.array([6.0, 8.0, 9.0, 11.0], dtype=np.float32))
+    lr.fit(X, y)
+
+    X_new = pd.DataFrame()
+    X_new["col1"] = np.array([3, 2], dtype=np.float32)
+    X_new["col2"] = np.array([5, 5], dtype=np.float32)
+    preds = lr.predict(X_new)
+    return preds.values
+
+
+def test_logistic_regression(binary_classification_data):
+    X = binary_classification_data[["feature1", "feature2"]]
+    y = binary_classification_data["target"]
+
+    (X_train, X_test, y_train, y_test) = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    model = LogisticRegression()
+    model.fit(X_train, y_train)
+
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+
+    return accuracy
+
+
+def test_random_forest(binary_classification_data):
+    X = binary_classification_data[["feature1", "feature2"]]
+    y = binary_classification_data["target"]
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+    model = RandomForestClassifier(n_estimators=100)
+    model.fit(X_train, y_train)
+    preds = model.predict(X_test)
+    return preds.values
+
+
+def test_clustering():
+    rng = np.random.default_rng(42)
+    nsamps = 300
+    X = rng.random((nsamps, 2))
+    data = pd.DataFrame(X, columns=["x", "y"])
+
+    kmeans = KMeans(n_clusters=3, random_state=42)
+    kmeans.fit(data)
+    return kmeans
+
+
+def test_data_scaling():
+    data = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
+    scaler = StandardScaler()
+
+    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))
+    return scaled_data
+
+
+def test_pipeline(binary_classification_data):
+    X = binary_classification_data[["feature1", "feature2"]]
+    y = binary_classification_data["target"]
+
+    pipe = Pipeline(
+        [
+            ("scaler", StandardScaler()),
+            ("pca", PCA()),
+            ("logistic_regression", LogisticRegression()),
+        ]
+    )
+
+    pipe.fit(X, y)
+    results = pipe.predict(X)
+    return results.values
+
+
+@pytest.mark.parametrize(
+    "X, y",
+    [
+        (pd.DataFrame({"a": range(10), "b": range(10)}), pd.Series(range(10))),
+        (
+            pd.DataFrame({"a": range(10), "b": range(10)}).values,
+            pd.Series(range(10)).values,
+        ),  # cudf.pandas wrapped numpy arrays
+    ],
+)
+def test_train_test_split(X, y):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+    # Compare only the size of the data splits
+    return len(X_train), len(X_test), len(y_train), len(y_test)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py
new file mode 100644
index 00000000000..c34778dfded
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py
@@ -0,0 +1,10 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import pandas as pd
+
+import dask.dataframe as dd
+
+
+def test_sum():
+    data = {"x": range(1, 11)}
+    ddf = dd.from_pandas(pd.DataFrame(data), npartitions=2)
+    return ddf["x"].sum().compute()
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py
new file mode 100644
index 00000000000..3e247291fad
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import numpy as np
+import pandas as pd
+from feature_engine.imputation import DropMissingData
+from feature_engine.preprocessing import MatchVariables
+
+
+def test_drop_missing_data():
+    data = {
+        "x": [np.nan, 1, 1, 0, np.nan],
+        "y": ["a", np.nan, "b", np.nan, "a"],
+    }
+    df = pd.DataFrame(data)
+
+    dmd = DropMissingData()
+    dmd.fit(df)
+    dmd.transform(df)
+
+    return dmd
+
+
+def test_match_variables():
+    train = pd.DataFrame(
+        {
+            "Name": ["tom", "nick", "krish", "jack"],
+            "City": ["London", "Manchester", "Liverpool", "Bristol"],
+            "Age": [20, 21, 19, 18],
+            "Marks": [0.9, 0.8, 0.7, 0.6],
+        }
+    )
+
+    test = pd.DataFrame(
+        {
+            "Name": ["tom", "sam", "nick"],
+            "Age": [20, 22, 23],
+            "Marks": [0.9, 0.7, 0.6],
+            "Hobbies": ["tennis", "rugby", "football"],
+        }
+    )
+
+    match_columns = MatchVariables()
+
+    match_columns.fit(train)
+
+    df_transformed = match_columns.transform(test)
+
+    return df_transformed
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
new file mode 100644
index 00000000000..bef02c86355
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
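+# Each test returns the (data, ndims, kdims, vdims, shape) tuple produced by
+# get_plot_info below, so the gold and cudf.pandas passes can be compared
+# field by field.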
+import holoviews as hv
+import numpy as np
+import pandas as pd
+import pytest
+
+nsamps = 1000
+hv.extension("bokeh")  # load holoviews extension
+
+
+def assert_holoviews_equal(expect, got):
+    expect_data, expect_ndims, expect_kdims, expect_vdims, expect_shape = (
+        expect
+    )
+    got_data, got_ndims, got_kdims, got_vdims, got_shape = got
+
+    if isinstance(expect_data, dict):
+        np.testing.assert_allclose(expect_data["x"], got_data["x"])
+        np.testing.assert_allclose(
+            expect_data["Frequency"], got_data["Frequency"]
+        )
+    else:
+        pd._testing.assert_frame_equal(expect_data, got_data)
+    assert expect_ndims == got_ndims
+    assert expect_kdims == got_kdims
+    assert expect_vdims == got_vdims
+    assert expect_shape == got_shape
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_holoviews_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame(
+        {
+            "x": rng.random(nsamps),
+            "y": rng.random(nsamps),
+            "category": rng.integers(0, 10, nsamps),
+            "category2": rng.integers(0, 10, nsamps),
+        }
+    )
+
+
+def get_plot_info(plot):
+    return (
+        plot.data,
+        plot.ndims,
+        plot.kdims,
+        plot.vdims,
+        plot.shape,
+    )
+
+
+def test_holoviews_barplot(df):
+    return get_plot_info(hv.Bars(df, kdims="category", vdims="y"))
+
+
+def test_holoviews_scatterplot(df):
+    return get_plot_info(hv.Scatter(df, kdims="x", vdims="y"))
+
+
+def test_holoviews_curve(df):
+    return get_plot_info(hv.Curve(df, kdims="category", vdims="y"))
+
+
+def test_holoviews_heatmap(df):
+    return get_plot_info(
+        hv.HeatMap(df, kdims=["category", "category2"], vdims="y")
+    )
+
+
+def test_holoviews_histogram(df):
+    return get_plot_info(hv.Histogram(df.values))
+
+
+def test_holoviews_hexbin(df):
+    return get_plot_info(hv.HexTiles(df, kdims=["x", "y"], vdims="y"))
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py
new file mode 100644
index 00000000000..0f0d2f8bcbd
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import hvplot.pandas  # noqa: F401, needed to monkey-patch pandas
+import numpy as np
+import pandas as pd
+import pytest
+
+nsamps = 1000
+
+
+def assert_hvplot_equal(expect, got):
+    expect_data, expect_ndims, expect_kdims, expect_vdims, expect_shape = (
+        expect
+    )
+    got_data, got_ndims, got_kdims, got_vdims, got_shape = got
+
+    if isinstance(expect_data, dict):
+        np.testing.assert_allclose(expect_data["x"], got_data["x"])
+        np.testing.assert_allclose(
+            expect_data["Frequency"], got_data["Frequency"]
+        )
+    else:
+        pd._testing.assert_frame_equal(expect_data, got_data)
+    assert expect_ndims == got_ndims
+    assert expect_kdims == got_kdims
+    assert expect_vdims == got_vdims
+    assert expect_shape == got_shape
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_hvplot_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame(
+        {
+            "x": rng.random(nsamps),
+            "y": rng.random(nsamps),
+            "category": rng.integers(0, 10, nsamps),
+            "category2": rng.integers(0, 10, nsamps),
+        }
+    )
+
+
+def get_plot_info(plot):
+    return (
+        plot.data,
+        plot.ndims,
+        plot.kdims,
+        plot.vdims,
+        plot.shape,
+    )
+
+
+def test_hvplot_barplot(df):
+    return get_plot_info(df.hvplot.bar(x="category", y="y"))
+
+
+def test_hvplot_scatterplot(df):
+    return get_plot_info(df.hvplot.scatter(x="x", y="y"))
+
+
+def test_hvplot_lineplot(df):
+    return get_plot_info(df.hvplot.line(x="x", y="y"))
+
+
+def test_hvplot_heatmap(df):
+    return get_plot_info(df.hvplot.heatmap(x="x", y="y", C="y"))
+
+
+def test_hvplot_hexbin(df):
+    return get_plot_info(df.hvplot.hexbin(x="x", y="y", C="y"))
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py
new file mode 100644
index 00000000000..2a8cf7c6ac2
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
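+# Runs ibis expressions on the pandas backend; every test materializes its
+# result with .to_pandas() so it can be pickled and compared across passes.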
+
+import ibis
+import numpy as np
+import pandas as pd
+import pytest
+
+ibis.set_backend("pandas")
+ibis.options.interactive = False
+
+
+def ibis_assert_equal(expect, got, rtol: float = 1e-7, atol: float = 0.0):
+    pd._testing.assert_almost_equal(expect, got, rtol=rtol, atol=atol)
+
+
+pytestmark = pytest.mark.assert_eq(fn=ibis_assert_equal)
+
+
+COLUMN_REDUCTIONS = ["sum", "min", "max", "mean", "var", "std"]
+ELEMENTWISE_UFUNCS = [
+    "sin",
+    "cos",
+    "atan",
+    "exp",
+    "log",
+    "abs",
+]
+STRING_UNARY_FUNCS = [
+    "lower",
+    "upper",
+    "capitalize",
+    "reverse",
+]
+
+
+@pytest.fixture
+def ibis_table_num_str():
+    N = 1000
+    K = 5
+    rng = np.random.default_rng(42)
+
+    df = pd.DataFrame(
+        rng.integers(0, 100, (N, K)), columns=[f"col{x}" for x in np.arange(K)]
+    )
+    df["key"] = rng.choice(np.arange(10), N)
+    df["str_col"] = rng.choice(["Hello", "World", "It's", "Me", "Again"], N)
+    table = ibis.memtable(df, name="t")
+    return table
+
+
+@pytest.fixture
+def ibis_table_num():
+    N = 100
+    K = 2
+    rng = np.random.default_rng(42)
+
+    df = pd.DataFrame(
+        rng.integers(0, 100, (N, K)), columns=[f"val{x}" for x in np.arange(K)]
+    )
+    df["key"] = rng.choice(np.arange(10), N)
+    table = ibis.memtable(df, name="t")
+    return table
+
+
+@pytest.mark.parametrize("op", COLUMN_REDUCTIONS)
+def test_column_reductions(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    return getattr(t.col1, op)().to_pandas()
+
+
+@pytest.mark.parametrize("op", ["mean", "sum", "min", "max"])
+def test_groupby_reductions(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    return getattr(t.group_by("key").col1, op)().to_pandas()
+
+
+@pytest.mark.parametrize("op", ELEMENTWISE_UFUNCS)
+def test_mutate_ufunc(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    expr = getattr(t.col1, op)()
+    return t.mutate(col1_sin=expr).to_pandas()
+
+
+@pytest.mark.parametrize("op", STRING_UNARY_FUNCS)
+def test_string_unary(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    return getattr(t.str_col, op)().to_pandas()
+
+
+def test_nunique(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.col1.nunique().to_pandas()
+
+
+def test_count(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.col1.count().to_pandas()
+
+
+def test_select(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.select("col0", "col1").to_pandas()
+
+
+def test_between(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.key.between(4, 8).to_pandas()
+
+
+def test_notin(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.key.notin([0, 1, 8, 3]).to_pandas()
+
+
+def test_window(ibis_table_num_str):
+    t = ibis_table_num_str
+    return (
+        t.group_by("key").mutate(demeaned=t.col1 - t.col1.mean()).to_pandas()
+    )
+
+
+def test_limit(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.limit(5).to_pandas()
+
+
+def test_filter(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.filter([t.key == 4, t.col0 > 15]).to_pandas()
+
+
+@pytest.mark.skip(reason="Join ordering not currently guaranteed, i.e., flaky")
+@pytest.mark.parametrize("join_type", ["inner", "left", "right"])
+def test_join_exact_ordering(ibis_table_num_str, ibis_table_num, join_type):
+    t1 = ibis_table_num_str
+    t2 = ibis_table_num
+    res = t1.join(t2, "key", how=join_type).to_pandas()
+    return res
+
+
+@pytest.mark.parametrize("join_type", ["inner", "left", "right"])
+def test_join_sort_correctness(ibis_table_num_str, ibis_table_num, join_type):
+    """
+    While we don't currently guarantee exact row ordering,
+    we can still test join correctness with ex-post sorting.
+    """
+    t1 = ibis_table_num_str
+    t2 = ibis_table_num
+    res = t1.join(t2, "key", how=join_type).to_pandas()
+
+    res_sorted = res.sort_values(by=res.columns.tolist()).reset_index(
+        drop=True
+    )
+    return res_sorted
+
+
+def test_order_by(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.order_by(ibis.desc("col1")).to_pandas()
+
+
+def test_aggregate_having(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.aggregate(
+        by=["key"],
+        sum_c0=t.col0.sum(),
+        avg_c0=t.col0.mean(),
+        having=t.col1.mean() > 50,
+    ).to_pandas()
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
new file mode 100644
index 00000000000..665b9d6fb08
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import pytest
+from matplotlib.axes import Axes
+from matplotlib.collections import PathCollection
+from matplotlib.lines import Line2D
+from matplotlib.patches import Rectangle
+from pandas._testing import assert_equal
+
+
+def assert_plots_equal(expect, got):
+    if isinstance(expect, Axes) and isinstance(got, Axes):
+        for expect_ch, got_ch in zip(
+            expect.get_children(), got.get_children()
+        ):
+            assert type(expect_ch) == type(got_ch)
+            if isinstance(expect_ch, Line2D):
+                assert_equal(expect_ch.get_xdata(), got_ch.get_xdata())
+                assert_equal(expect_ch.get_ydata(), got_ch.get_ydata())
+            elif isinstance(expect_ch, Rectangle):
+                assert expect_ch.get_height() == got_ch.get_height()
+    elif isinstance(expect, PathCollection) and isinstance(
+        got, PathCollection
+    ):
+        assert_equal(expect.get_offsets()[:, 0], got.get_offsets()[:, 0])
+        assert_equal(expect.get_offsets()[:, 1], got.get_offsets()[:, 1])
+    else:
+        assert_equal(expect, got)
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal)
+
+
+def test_line():
+    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})
+    (data,) = plt.plot(df["x"], df["y"], marker="o", linestyle="-")
+
+    return plt.gca()
+
+
+def test_bar():
+    data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
+    ax = data.plot(kind="bar")
+    return ax
+
+
+def test_scatter():
+    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [5, 4, 3, 2, 1]})
+
+    fig, ax = plt.subplots(figsize=(8, 6))
+    ax.scatter(df["x"], df["y"])
+
+    return plt.gca()
+
+
+def test_dataframe_plot():
+    rng = np.random.default_rng(42)
+    df = pd.DataFrame(rng.random((10, 5)), columns=["a", "b", "c", "d", "e"])
+    ax = df.plot()
+
+    return ax
+
+
+def test_series_plot():
+    sr = pd.Series([1, 2, 3, 4, 5])
+    ax = sr.plot()
+
+    return ax
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
new file mode 100644
index 00000000000..472f1889354
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
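+# Applies numpy reductions and ufuncs directly to pandas objects; results are
+# compared with np.testing.assert_allclose.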
+
+import numpy as np
+import pandas as pd
+import pytest
+
+nsamps = 1000
+reductions = ["sum", "min", "max", "mean", "var", "std"]
+
+
+pytestmark = pytest.mark.assert_eq(fn=np.testing.assert_allclose)
+
+
+@pytest.fixture(scope="module")
+def sr():
+    rng = np.random.default_rng(42)
+    return pd.Series(rng.random(nsamps))
+
+
+@pytest.mark.parametrize("op", reductions)
+def test_numpy_series_reductions(sr, op):
+    return getattr(np, op)(sr)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame({"A": rng.random(nsamps), "B": rng.random(nsamps)})
+
+
+@pytest.mark.parametrize("op", reductions)
+def test_numpy_dataframe_reductions(df, op):
+    return getattr(np, op)(df)
+
+
+def test_numpy_dot(df):
+    return np.dot(df, df.T)
+
+
+def test_numpy_fft(sr):
+    fft = np.fft.fft(sr)
+    return fft
+
+
+def test_numpy_sort(df):
+    return np.sort(df)
+
+
+@pytest.mark.parametrize("percentile", [0, 25, 50, 75, 100])
+def test_numpy_percentile(df, percentile):
+    return np.percentile(df, percentile)
+
+
+def test_numpy_unique(df):
+    return np.unique(df)
+
+
+def test_numpy_transpose(df):
+    return np.transpose(df)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py
new file mode 100644
index 00000000000..27d9df83476
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import pytest
+
+nsamps = 100
+
+
+def assert_plotly_equal(expect, got):
+    assert type(expect) == type(got)
+    if isinstance(expect, dict):
+        assert expect.keys() == got.keys()
+        for k in expect.keys():
+            assert_plotly_equal(expect[k], got[k])
+    elif isinstance(got, list):
+        assert len(expect) == len(got)
+        for i in range(len(expect)):
+            assert_plotly_equal(expect[i], got[i])
+    elif isinstance(expect, np.ndarray):
+        np.testing.assert_allclose(expect, got)
+    else:
+        assert expect == got
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_plotly_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame(
+        {
+            "x": rng.random(nsamps),
+            "y": rng.random(nsamps),
+            "category": rng.integers(0, 10, nsamps),
+            "category2": rng.integers(0, 10, nsamps),
+        }
+    )
+
+
+def test_plotly_scatterplot(df):
+    return px.scatter(df, x="x", y="y").to_plotly_json()
+
+
+def test_plotly_lineplot(df):
+    return px.line(df, x="category", y="y").to_plotly_json()
+
+
+def test_plotly_barplot(df):
+    return px.bar(df, x="category", y="y").to_plotly_json()
+
+
+def test_plotly_histogram(df):
+    return px.histogram(df, x="category").to_plotly_json()
+
+
+def test_plotly_pie(df):
+    return px.pie(df, values="category", names="category2").to_plotly_json()
+
+
+def test_plotly_heatmap(df):
+    return px.density_heatmap(df, x="category", y="category2").to_plotly_json()
+
+
+def test_plotly_boxplot(df):
+    return px.box(df, x="category", y="y").to_plotly_json()
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
new file mode 100644
index 00000000000..ae9db3836a6
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
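+# Feeds pandas-backed Datasets through torch DataLoaders and a small CUDA
+# model; outputs are compared with torch.testing.assert_close.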
+
+import numpy as np
+import pandas as pd
+import pytest
+import torch
+
+pytestmark = pytest.mark.assert_eq(fn=torch.testing.assert_close)
+
+
+@pytest.fixture
+def data():
+    rng = np.random.default_rng(0)
+    x1 = rng.random(100, dtype=np.float32)
+    x2 = rng.random(100, dtype=np.float32)
+    y = np.zeros(100).astype(np.int64)
+
+    y[(x1 > x2) & (x1 > 0)] = 0
+    y[(x1 < x2) & (x1 > 0)] = 1
+    y[(x1 > x2) & (x1 < 0)] = 2
+    y[(x1 < x2) & (x1 < 0)] = 3
+
+    return x1, x2, y
+
+
+class Dataset(torch.utils.data.Dataset):
+    def __init__(self, x1, x2, y):
+        self.x1 = x1
+        self.x2 = x2
+        self.y = y
+
+    def __getitem__(self, idx):
+        x1 = self.x1[idx]
+        x2 = self.x2[idx]
+        y = self.y[idx]
+        return (x1, x2), y
+
+    def __len__(self):
+        return len(self.x1)
+
+
+def test_dataloader_auto_batching(data):
+    x1, x2, y = (pd.Series(i) for i in data)
+
+    dataset = Dataset(x1, x2, y)
+
+    # default collate_fn
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=10)
+
+    (x1, x2), y = next(iter(dataloader))
+    return x1, x2, y
+
+
+def test_dataloader_manual_batching(data):
+    x1, x2, y = (pd.Series(i) for i in data)
+
+    dataset = Dataset(x1, x2, y)
+
+    # default collate_fn
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=None)
+
+    (x1, x2), y = next(iter(dataloader))
+    return x1, x2, y
+
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc1 = torch.nn.Linear(2, 10)
+        self.relu1 = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(10, 10)
+        self.relu2 = torch.nn.ReLU()
+        self.output = torch.nn.Linear(10, 4)
+
+    def forward(self, x1, x2):
+        x = torch.stack([x1, x2], dim=0).T
+        x = self.fc1(x)
+        x = self.relu1(x)
+        x = self.fc2(x)
+        x = self.relu2(x)
+        return torch.nn.functional.softmax(x, dim=1)
+
+
+def train(model, dataloader, optimizer, criterion):
+    model.train()
+    for (x1, x2), y in dataloader:
+        x1 = x1.to("cuda")
+        x2 = x2.to("cuda")
+        y = y.to("cuda")
+
+        optimizer.zero_grad()
+        y_pred = model(x1, x2)
+        loss = criterion(y_pred, y)
+        loss.backward()
+        optimizer.step()
+
+
+def test_torch_train(data):
+    torch.manual_seed(0)
+
+    x1, x2, y = (pd.Series(i) for i in data)
+    dataset = Dataset(x1, x2, y)
+    # default collate_fn
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=10)
+
+    model = Model().to("cuda")
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+    criterion = torch.nn.CrossEntropyLoss()
+
+    train(model, dataloader, optimizer, criterion)
+
+    test_x1, test_x2 = next(iter(dataloader))[0]
+    test_x1 = test_x1.to("cuda")
+    test_x2 = test_x2.to("cuda")
+
+    return model(test_x1, test_x2)
+
+
+def test_torch_tensor_ctor():
+    s = pd.Series(range(5))
+    return torch.tensor(s.values)
+
+
+@pytest.mark.xfail_cudf_pandas(reason="Known failure, see xdf/#210")
+@pytest.mark.xfail_compare
+def test_torch_tensor_from_numpy():
+    s = pd.Series(range(5))
+    return torch.from_numpy(s.values)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py
new file mode 100644
index 00000000000..963a8549000
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
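+# Exercises scipy.stats, scipy.linalg, scipy.integrate, and scipy.optimize on
+# pandas inputs.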
+
+import numpy as np
+import pandas as pd
+import pytest
+import scipy
+
+
+@pytest.mark.parametrize("func", ["hmean", "tvar", "gstd"])
+def test_scipy_stats(func):
+    rng = np.random.default_rng(42)
+    data = pd.Series(rng.random(1000))
+    return getattr(scipy.stats, func)(data)
+
+
+@pytest.mark.parametrize("func", ["norm"])
+def test_scipy_linalg(func):
+    rng = np.random.default_rng(42)
+    data = pd.Series(rng.random(1000))
+    return getattr(scipy.linalg, func)(data)
+
+
+pytestmark = pytest.mark.assert_eq(fn=pd._testing.assert_almost_equal)
+
+
+def test_compute_pi():
+    def circle(x):
+        return (1 - x**2) ** 0.5
+
+    x = pd.Series(np.linspace(0, 1, 100))
+    y = pd.Series(circle(np.linspace(0, 1, 100)))
+
+    result = scipy.integrate.trapezoid(y, x)
+    return result * 4
+
+
+def test_matrix_solve():
+    A = pd.DataFrame([[2, 3], [1, 2]])
+    b = pd.Series([1, 2])
+
+    return scipy.linalg.solve(A, b)
+
+
+def test_correlation():
+    data = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [5, 4, 3, 2, 1]})
+
+    return scipy.stats.pearsonr(data["A"], data["B"])
+
+
+def test_optimization():
+    x = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
+
+    def rosen(x):  # banana function from scipy tutorial
+        return sum(
+            100.0 * (x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0
+        )
+
+    result = scipy.optimize.fmin(rosen, x)
+    return result
+
+
+def test_regression():
+    data = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 5, 4, 5]})
+    result = scipy.stats.linregress(data["x"], data["y"])
+    return result.slope, result.intercept
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
new file mode 100644
index 00000000000..4b272900acd
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
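+# Compares seaborn Axes objects child by child (lines, rectangles, scatter
+# offsets) between the gold and cudf.pandas passes.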
+import pandas as pd
+import pytest
+import seaborn as sns
+from matplotlib.axes import Axes
+from matplotlib.collections import PathCollection
+from matplotlib.lines import Line2D
+from matplotlib.patches import Rectangle
+from pandas._testing import assert_equal
+
+
+def assert_plots_equal(expect, got):
+    if isinstance(expect, Axes) and isinstance(got, Axes):
+        for expect_ch, got_ch in zip(
+            expect.get_children(), got.get_children()
+        ):
+            assert type(expect_ch) == type(got_ch)
+            if isinstance(expect_ch, Line2D):
+                assert_equal(expect_ch.get_xdata(), got_ch.get_xdata())
+                assert_equal(expect_ch.get_ydata(), got_ch.get_ydata())
+            elif isinstance(expect_ch, Rectangle):
+                assert expect_ch.get_height() == got_ch.get_height()
+    elif isinstance(expect, PathCollection) and isinstance(
+        got, PathCollection
+    ):
+        assert_equal(expect.get_offsets()[:, 0], got.get_offsets()[:, 0])
+        assert_equal(expect.get_offsets()[:, 1], got.get_offsets()[:, 1])
+    else:
+        assert_equal(expect, got)
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    df = pd.DataFrame(
+        {
+            "x": [2, 3, 4, 5, 11],
+            "y": [4, 3, 2, 1, 15],
+            "hue": ["c", "a", "b", "b", "a"],
+        }
+    )
+    return df
+
+
+def test_bar(df):
+    ax = sns.barplot(data=df, x="x", y="y")
+    return ax
+
+
+def test_scatter(df):
+    ax = sns.scatterplot(data=df, x="x", y="y", hue="hue")
+    return ax
+
+
+def test_lineplot_with_sns_data():
+    df = sns.load_dataset("flights")
+    ax = sns.lineplot(data=df, x="month", y="passengers")
+    return ax
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py
new file mode 100644
index 00000000000..1635fd3dcda
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
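+# Covers a small scikit-learn workflow on pandas inputs: train/test split,
+# logistic regression, KMeans clustering, feature selection, and scaling.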
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.cluster import KMeans
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+
+def test_regression():
+    data = {
+        "feature1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        "feature2": [2, 4, 1, 3, 5, 7, 6, 8, 10, 9],
+        "target": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+    }
+    df = pd.DataFrame(data)
+
+    X = df[["feature1", "feature2"]]
+    y = df["target"]
+
+    # Data splitting
+    (X_train, X_test, y_train, y_test) = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # Basic deterministic LR model
+    model = LogisticRegression()
+    model.fit(X_train, y_train)
+
+    # prediction phase
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+
+    return accuracy
+
+
+@pytest.mark.assert_eq(fn=np.testing.assert_allclose)
+def test_clustering():
+    rng = np.random.default_rng(42)
+    nsamps = 300
+    X = rng.random((nsamps, 2))
+    data = pd.DataFrame(X, columns=["x", "y"])
+
+    # Create and fit a KMeans clustering model
+    kmeans = KMeans(n_clusters=3, random_state=42)
+    kmeans.fit(data)
+    return kmeans.cluster_centers_
+
+
+def test_feature_selection():
+    rng = np.random.default_rng(42)
+    n_samples = 100
+    n_features = 10
+
+    X = rng.random((n_samples, n_features))
+    y = rng.integers(0, 2, size=n_samples)
+
+    data = pd.DataFrame(
+        X, columns=[f"feature{i}" for i in range(1, n_features + 1)]
+    )
+    data["target"] = y
+
+    # Select the top k features
+    k_best = SelectKBest(score_func=f_classif, k=5)
+    k_best.fit_transform(X, y)
+
+    feat_inds = k_best.get_support(indices=True)
+    features = data.iloc[:, feat_inds]
+
+    return sorted(features.columns.tolist())
+
+
+@pytest.mark.assert_eq(fn=np.testing.assert_allclose)
+def test_data_scaling():
+    data = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
+    scaler = StandardScaler()
+
+    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))
+    return scaled_data
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py
new file mode 100644
index 00000000000..69248002a58
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
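+# Runs stumpy matrix-profile routines (stump, gpu_stump, mstump, atsc, fluss)
+# on pandas/numpy time series.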
+
+import numpy as np
+import pandas as pd
+import pytest
+import stumpy
+from numba import cuda
+from pandas._testing import assert_equal
+
+
+def stumpy_assert_equal(expected, got):
+    def as_float64(x):
+        if isinstance(x, (tuple, list)):
+            return [as_float64(y) for y in x]
+        else:
+            return x.astype(np.float64)
+
+    assert_equal(as_float64(expected), as_float64(got))
+
+
+pytestmark = pytest.mark.assert_eq(fn=stumpy_assert_equal)
+
+
+def test_1d_time_series():
+    rng = np.random.default_rng(42)
+    ts = pd.Series(rng.random(10))
+    m = 3
+
+    return stumpy.stump(ts, m)
+
+
+def test_1d_gpu():
+    rng = np.random.default_rng(42)
+    your_time_series = rng.random(10000)
+    window_size = (
+        50  # Approximately, how many data points might be found in a pattern
+    )
+    all_gpu_devices = [
+        device.id for device in cuda.list_devices()
+    ]  # Get a list of all available GPU devices
+
+    return stumpy.gpu_stump(
+        your_time_series, m=window_size, device_id=all_gpu_devices
+    )
+
+
+def test_multidimensional_timeseries():
+    rng = np.random.default_rng(42)
+    # Each row represents data from a different dimension while each column represents
+    # data from the same dimension
+    your_time_series = rng.random((3, 1000))
+    # Approximately, how many data points might be found in a pattern
+    window_size = 50
+
+    return stumpy.mstump(your_time_series, m=window_size)
+
+
+def test_anchored_time_series_chains():
+    rng = np.random.default_rng(42)
+    your_time_series = rng.random(10000)
+    window_size = (
+        50  # Approximately, how many data points might be found in a pattern
+    )
+
+    matrix_profile = stumpy.stump(your_time_series, m=window_size)
+
+    left_matrix_profile_index = matrix_profile[:, 2]
+    right_matrix_profile_index = matrix_profile[:, 3]
+    idx = 10  # Subsequence index for which to retrieve the anchored time series chain
+
+    anchored_chain = stumpy.atsc(
+        left_matrix_profile_index, right_matrix_profile_index, idx
+    )
+
+    all_chain_set, longest_unanchored_chain = stumpy.allc(
+        left_matrix_profile_index, right_matrix_profile_index
+    )
+
+    return anchored_chain, all_chain_set, longest_unanchored_chain
+
+
+def test_semantic_segmentation():
+    rng = np.random.default_rng(42)
+    your_time_series = rng.random(10000)
+    window_size = (
+        50  # Approximately, how many data points might be found in a pattern
+    )
+
+    matrix_profile = stumpy.stump(your_time_series, m=window_size)
+
+    subseq_len = 50
+    return stumpy.fluss(
+        matrix_profile[:, 1], L=subseq_len, n_regimes=2, excl_factor=1
+    )
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
new file mode 100644
index 00000000000..37e3cc34856
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
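+# Same stumpy checks as above, but distributed across a local dask cluster via
+# stumpy.stumped/mstumped.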
+
+import numpy as np
+import pandas as pd
+import pytest
+import stumpy
+from pandas._testing import assert_equal
+
+from dask.distributed import Client, LocalCluster
+
+
+def stumpy_assert_equal(expected, got):
+    def as_float64(x):
+        if isinstance(x, (tuple, list)):
+            return [as_float64(y) for y in x]
+        else:
+            return x.astype(np.float64)
+
+    assert_equal(as_float64(expected), as_float64(got))
+
+
+pytestmark = pytest.mark.assert_eq(fn=stumpy_assert_equal)
+
+
+# Shared dask client for all tests in this module
+@pytest.fixture(scope="module")
+def dask_client():
+    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
+        with Client(cluster) as dask_client:
+            yield dask_client
+
+
+def test_1d_distributed(dask_client):
+    np.random.seed(42)
+    ts = pd.Series(np.random.rand(100))
+    m = 10
+    return stumpy.stumped(dask_client, ts, m)
+
+
+def test_multidimensional_distributed_timeseries(dask_client):
+    np.random.seed(42)
+    # Each row represents data from a different dimension while each column represents
+    # data from the same dimension
+    your_time_series = np.random.rand(3, 1000)
+    # Approximately, how many data points might be found in a pattern
+    window_size = 50
+
+    return stumpy.mstumped(dask_client, your_time_series, m=window_size)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
new file mode 100644
index 00000000000..ba1f518cbfd
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
@@ -0,0 +1,367 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+import tensorflow as tf
+
+SHUFFLE_BUFFER = 500
+BATCH_SIZE = 2
+
+pytestmark = pytest.mark.assert_eq(fn=pd._testing.assert_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.RandomState(42)
+
+    nrows = 303
+    columns = {
+        "age": rng.randint(29, 78, size=(nrows,), dtype="int64"),
+        "sex": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+        "cp": rng.randint(0, 5, size=(nrows,), dtype="int64"),
+        "trestbps": rng.randint(94, 201, size=(nrows,), dtype="int64"),
+        "chol": rng.randint(126, 565, size=(nrows,), dtype="int64"),
+        "fbs": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+        "restecg": rng.randint(0, 3, size=(nrows,), dtype="int64"),
+        "thalach": rng.randint(71, 203, size=(nrows,), dtype="int64"),
+        "exang": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+        "oldpeak": rng.uniform(0.0, 6.2, size=(nrows,)),
+        "slope": rng.randint(1, 4, size=(nrows,), dtype="int64"),
+        "ca": rng.randint(0, 4, size=(nrows,), dtype="int64"),
+        "thal": rng.choice(
+            ["fixed", "normal", "reversible", "1", "2"], size=(nrows,)
+        ),
+        "target": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+    }
+
+    return pd.DataFrame(columns)
+
+
+@pytest.fixture(scope="module")
+def target(df):
+    return df.pop("target")
+
+
+@pytest.fixture
+def model_gen():
+    def make_model(numeric_features):
+        normalizer = tf.keras.layers.Normalization(axis=-1)
+        normalizer.adapt(numeric_features)
+        model = tf.keras.Sequential(
+            [
+                normalizer,
+                tf.keras.layers.Dense(10, activation="relu"),
+                tf.keras.layers.Dense(1),
+            ]
+        )
+
+        model.compile(
+            optimizer="adam",
+            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
+            metrics=["accuracy"],
+        )
+        return model
+
+    return make_model
+
+
+def test_dataframe_as_array(model_gen, df, target):
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    numeric_features = df[numeric_feature_names]
+
+    numeric_features = tf.convert_to_tensor(
+        numeric_features.values, dtype=tf.float32
+    )
+
+    model = model_gen(numeric_features)
+    model.fit(numeric_features, target, epochs=1, batch_size=BATCH_SIZE)
+
+    test_data = numeric_features[:BATCH_SIZE]
+    return model.predict(test_data)
+
+
+def test_dataframe_as_dataset(model_gen, df, target):
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    numeric_features = df[numeric_feature_names]
+
+    numeric_features = tf.convert_to_tensor(
+        numeric_features.values, dtype=tf.float32
+    )
+
+    dataset = tf.data.Dataset.from_tensor_slices((numeric_features, target))
+    dataset = dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
+
+    model = model_gen(numeric_features)
+    model.fit(dataset, epochs=1)
+
+    test_data = dataset.take(1)
+    return model.predict(test_data)
+
+
+def stack_dict(inputs, func=tf.stack):
+    values = []
+    for key in sorted(inputs.keys()):
+        values.append(CastLayer()(inputs[key]))
+
+    class MyLayer(tf.keras.layers.Layer):
+        def call(self, val):
+            return func(val, axis=-1)
+
+    return MyLayer()(values)
+
+
+def test_dataframe_as_dictionary_with_keras_input_layer(df, target):
+    # ensure deterministic results
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    numeric_features = df[numeric_feature_names]
+
+    inputs = {}
+    for name in numeric_features:
+        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=tf.float32)
+
+    x = stack_dict(inputs, func=tf.concat)
+
+    normalizer = tf.keras.layers.Normalization(axis=-1)
+    normalizer.adapt(stack_dict(dict(numeric_features)))
+
+    x = normalizer(x)
+    x = tf.keras.layers.Dense(10, activation="relu")(x)
+    x = tf.keras.layers.Dense(1)(x)
+
+    model = tf.keras.Model(inputs, x)
+
+    model.compile(
+        optimizer="adam",
+        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
+        metrics=["accuracy"],
+        run_eagerly=True,
+    )
+
+    # Train with dictionary of columns as input:
+    model.fit(dict(numeric_features), target, epochs=1, batch_size=BATCH_SIZE)
+
+    # Train with a dataset of dictionary-elements
+    numeric_dict_ds = tf.data.Dataset.from_tensor_slices(
+        (dict(numeric_features), target)
+    )
+    numeric_dict_batches = numeric_dict_ds.shuffle(SHUFFLE_BUFFER).batch(
+        BATCH_SIZE
+    )
+    model.fit(numeric_dict_batches, epochs=1)
+
+    # Predict
+    return model.predict(numeric_dict_batches.take(1))
+
+
+def test_full_example_train_with_ds(df, target):
+    # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example
+    # Inputs are converted to tf.dataset and then batched
+
+    # ensure deterministic results
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    binary_feature_names = ["sex", "fbs", "exang"]
+    categorical_feature_names = ["cp", "restecg", "slope", "thal", "ca"]
+
+    numeric_features = df[numeric_feature_names]
+
+    inputs = {}
+    for name, column in df.items():
+        if isinstance(column[0], str):
+            dtype = tf.string
+        elif name in categorical_feature_names or name in binary_feature_names:
+            dtype = tf.int64
+        else:
+            dtype = tf.float32
+
+        inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype)
+
+    preprocessed = []
+
+    # Process binary features
+    for name in binary_feature_names:
+        inp = inputs[name]
+        inp = inp[:, tf.newaxis]
+        float_value = CastLayer()(inp)
+        preprocessed.append(float_value)
+
= tf.keras.layers.Normalization(axis=-1) + normalizer.adapt(stack_dict(dict(numeric_features))) + + # Process numeric features + numeric_inputs = {} + for name in numeric_feature_names: + numeric_inputs[name] = inputs[name] + + numeric_inputs = stack_dict(numeric_inputs) + numeric_normalized = normalizer(numeric_inputs) + + preprocessed.append(numeric_normalized) + + # Process categorical features + for name in categorical_feature_names: + vocab = sorted(set(df[name])) + print(f"name: {name}") + print(f"vocab: {vocab}\n") + + if isinstance(vocab[0], str): + lookup = tf.keras.layers.StringLookup( + vocabulary=vocab, output_mode="one_hot" + ) + else: + lookup = tf.keras.layers.IntegerLookup( + vocabulary=vocab, output_mode="one_hot" + ) + + x = inputs[name][:, tf.newaxis] + x = lookup(x) + preprocessed.append(x) + + # Concatenate all tensors + preprocesssed_result = MyConcatLayer()(preprocessed) + + preprocessor = tf.keras.Model(inputs, preprocesssed_result) + + # Create the model + body = tf.keras.Sequential( + [ + tf.keras.layers.Dense(10, activation="relu"), + tf.keras.layers.Dense(10, activation="relu"), + tf.keras.layers.Dense(1), + ] + ) + + x = preprocessor(inputs) + result = body(x) + + model = tf.keras.Model(inputs, result) + + model.compile( + optimizer="adam", + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=["accuracy"], + ) + + ds = tf.data.Dataset.from_tensor_slices((dict(df), target)) + ds = ds.batch(BATCH_SIZE) + model.fit(ds, epochs=1) + + return model.predict(ds.take(1)) + + +class CastLayer(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super(CastLayer, self).__init__(**kwargs) + + def call(self, inp): + return tf.cast(inp, tf.float32) + + +class MyConcatLayer(tf.keras.layers.Layer): + def call(self, values): + values = [tf.cast(v, tf.float32) for v in values] + return tf.concat(values, axis=-1) + + +def test_full_example_train_with_df(df, target): + # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example + # Inputs are directly passed as dictionary of series + + # ensure deterministic results + tf.keras.utils.set_random_seed(42) + + numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"] + binary_feature_names = ["sex", "fbs", "exang"] + categorical_feature_names = ["cp", "restecg", "slope", "thal", "ca"] + + numeric_features = df[numeric_feature_names] + + inputs = {} + + for name, column in df.items(): + if isinstance(column[0], str): + dtype = tf.string + elif name in categorical_feature_names or name in binary_feature_names: + dtype = tf.int64 + else: + dtype = tf.float32 + + inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype) + + preprocessed = [] + + # Process binary features + for name in binary_feature_names: + inp = inputs[name] + inp = inp[:, tf.newaxis] + float_value = CastLayer()(inp) + preprocessed.append(float_value) + + normalizer = tf.keras.layers.Normalization(axis=-1) + normalizer.adapt(stack_dict(dict(numeric_features))) + + # Process numeric features + numeric_inputs = {} + for name in numeric_feature_names: + numeric_inputs[name] = inputs[name] + + numeric_inputs = stack_dict(numeric_inputs) + numeric_normalized = normalizer(numeric_inputs) + + preprocessed.append(numeric_normalized) + + # Process categorical features + for name in categorical_feature_names: + vocab = sorted(set(df[name])) + print(f"name: {name}") + print(f"vocab: {vocab}\n") + + if isinstance(vocab[0], str): + lookup = tf.keras.layers.StringLookup( + vocabulary=vocab, output_mode="one_hot" + ) + 
else: + lookup = tf.keras.layers.IntegerLookup( + vocabulary=vocab, output_mode="one_hot" + ) + + x = inputs[name][:, tf.newaxis] + x = lookup(x) + preprocessed.append(x) + + # Concatenate all tensors + preprocessed_result = MyConcatLayer()(preprocessed) + + preprocessor = tf.keras.Model(inputs, preprocessed_result) + + # Create the model + body = tf.keras.Sequential( + [ + tf.keras.layers.Dense(10, activation="relu"), + tf.keras.layers.Dense(10, activation="relu"), + tf.keras.layers.Dense(1), + ] + ) + + x = preprocessor(inputs) + result = body(x) + + model = tf.keras.Model(inputs, result) + + model.compile( + optimizer="adam", + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=["accuracy"], + ) + + model.fit(dict(df), target, epochs=1, batch_size=BATCH_SIZE) + + return model.predict(dict(df[:BATCH_SIZE])) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py new file mode 100644 index 00000000000..70f1e6a4250 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py @@ -0,0 +1,135 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse +import xgboost as xgb +from sklearn.datasets import make_regression +from xgboost.testing import IteratorForTest, make_categorical + +n_samples = 128 +n_features = 16 + + +def xgboost_assert_equal(expect, got, rtol: float = 1e-7, atol: float = 0.0): + if isinstance(expect, (tuple, list)): + assert len(expect) == len(got) + for e, g in zip(expect, got): + xgboost_assert_equal(e, g, rtol, atol) + elif isinstance(expect, scipy.sparse.csr_matrix): + np.testing.assert_allclose(expect.data, got.data, rtol=rtol, atol=atol) + np.testing.assert_equal(expect.indices, got.indices) + np.testing.assert_equal(expect.indptr, got.indptr) + else: + pd._testing.assert_almost_equal(expect, got, rtol=rtol, atol=atol) + + +pytestmark = pytest.mark.assert_eq(fn=xgboost_assert_equal) + + +@pytest.fixture +def reg_data() -> tuple[np.ndarray, np.ndarray]: + X, y = make_regression(n_samples, n_features, random_state=11) + return X, y + + +@pytest.fixture +def reg_batches_data() -> tuple[list[pd.DataFrame], list[pd.Series]]: + cov = [] + res = [] + for i in range(3): + X, y = make_regression(n_samples, n_features, random_state=i + 1) + cov.append(pd.DataFrame(X)) + res.append(pd.Series(y)) + return cov, res + + +def test_with_dmatrix( + reg_data: tuple[np.ndarray, np.ndarray], +) -> tuple[scipy.sparse.csr_matrix, scipy.sparse.csr_matrix]: + """DMatrix is the primary interface for XGBoost.""" + X, y = reg_data + X_df = pd.DataFrame(X) + y_ser = pd.Series(y) + Xy = xgb.DMatrix(X_df, y_ser) + assert Xy.feature_names == list(map(str, X_df.columns)) + csr_0 = Xy.get_data() + + Xc, yc = make_categorical( + n_samples, n_features, n_categories=13, onehot=False + ) + Xy = xgb.DMatrix(Xc, yc, enable_categorical=True) + csr_1 = Xy.get_data() + return csr_0, csr_1 + + +def test_with_quantile_dmatrix( + reg_data: tuple[np.ndarray, np.ndarray], +) -> tuple[scipy.sparse.csr_matrix, scipy.sparse.csr_matrix]: + """QuantileDMatrix is an optimization for the `hist` tree method for XGBoost.""" + from xgboost.testing.data import memory + + memory.clear(warn=False) + + X, y = reg_data + X_df = pd.DataFrame(X) + y_ser = pd.Series(y) + Xy = xgb.QuantileDMatrix(X_df, y_ser) + assert Xy.feature_names ==
list(map(str, X_df.columns)) + csr_0 = Xy.get_data() + + Xc, yc = make_categorical( + n_samples, n_features, n_categories=13, onehot=False + ) + Xy = xgb.QuantileDMatrix(Xc, yc, enable_categorical=True) + csr_1 = Xy.get_data() + return csr_0, csr_1 + + +def test_with_iter_quantile_dmatrix( + reg_batches_data: tuple[list[pd.DataFrame], list[pd.Series]], +) -> scipy.sparse.csr_matrix: + """Use an iterator to initialize a QuantileDMatrix.""" + cov, res = reg_batches_data + it = IteratorForTest(cov, res, w=None, cache=None) + Xy = xgb.QuantileDMatrix(it) + csr = Xy.get_data() + return csr + + +@pytest.mark.parametrize("device", ["cpu", "cuda"]) +def test_with_external_memory( + device: str, + reg_batches_data: tuple[list[pd.DataFrame], list[pd.Series]], +) -> np.ndarray: + """Test with iterator-based external memory.""" + cov, res = reg_batches_data + it = IteratorForTest(cov, res, w=None, cache="cache") + Xy = xgb.DMatrix(it) + predt = xgb.train({"device": device}, Xy, num_boost_round=1).predict(Xy) + return predt + + +@pytest.mark.parametrize("device", ["cpu", "cuda"]) +def test_predict(device: str) -> np.ndarray: + reg = xgb.XGBRegressor(n_estimators=2, device=device) + X, y = make_regression(n_samples, n_features, random_state=11) + X_df = pd.DataFrame(X) + reg.fit(X_df, y) + booster = reg.get_booster() + + predt0 = reg.predict(X_df) + + predt1 = booster.inplace_predict(X_df) + np.testing.assert_allclose(predt0, predt1) + + predt2 = booster.predict(xgb.DMatrix(X_df)) + np.testing.assert_allclose(predt0, predt2) + + predt3 = booster.inplace_predict(X) + np.testing.assert_allclose(predt0, predt3) + + return predt0 From 23fb31e7af5e768722f640601034a9d490c2e54c Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 29 Aug 2024 19:27:06 -0700 Subject: [PATCH 153/270] Add a libcudf/thrust-based TPC-H derived datagen (#16294) This PR adds a TPC-H-inspired datagen (per spec 3.0.1) written using `libcudf` and `thrust`. ### Implementation Status - [x] lineitem - [x] orders - [x] region - [x] nation - [x] supplier - [x] customer - [x] part - [x] partsupp Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Mark Harris (https://github.com/harrism) - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16294 --- cpp/benchmarks/CMakeLists.txt | 24 + .../random_column_generator.cu | 246 +++ .../random_column_generator.hpp | 150 +++ .../tpch_data_generator/table_helpers.cpp | 386 +++++++ .../tpch_data_generator/table_helpers.hpp | 155 +++ .../tpch_data_generator.cpp | 987 ++++++++++++++++++ .../tpch_data_generator.hpp | 94 ++ 7 files changed, 2042 insertions(+) create mode 100644 cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu create mode 100644 cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp create mode 100644 cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp create mode 100644 cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp create mode 100644 cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp create mode 100644 cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 99ef9e2976f..d2c22b788cb 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -35,6 +35,30 @@ target_include_directories( "$" ) +add_library( + tpch_data_generator
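# NOTE: the tpch_data_generator library defined here links against cudftestutil
# because the generators reuse cudf::test column wrappers (strings_column_wrapper,
# fixed_width_column_wrapper) to stage small host-side lookup tables.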
STATIC + common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp + common/tpch_data_generator/random_column_generator.cu +) +target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17) + +target_compile_options( + tpch_data_generator PUBLIC "$<$:${CUDF_CXX_FLAGS}>" + "$<$:${CUDF_CUDA_FLAGS}>" +) + +target_link_libraries( + tpch_data_generator + PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp + PRIVATE $ +) + +target_include_directories( + tpch_data_generator + PUBLIC "$" "$" + "$" +) + # ################################################################################################## # * compiler function ----------------------------------------------------------------------------- diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu new file mode 100644 index 00000000000..4246bd1a83b --- /dev/null +++ b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "random_column_generator.hpp" + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace cudf::datagen { + +namespace { + +// Functor for generating random strings +struct random_string_generator { + char* chars; + thrust::default_random_engine engine; + thrust::uniform_int_distribution char_dist; + + CUDF_HOST_DEVICE random_string_generator(char* c) : chars(c), char_dist(44, 122) {} + + __device__ void operator()(thrust::tuple str_begin_end) + { + auto begin = thrust::get<0>(str_begin_end); + auto end = thrust::get<1>(str_begin_end); + engine.discard(begin); + for (auto i = begin; i < end; ++i) { + auto ch = char_dist(engine); + if (i == end - 1 && ch >= '\x7F') ch = ' '; // last element ASCII only. 
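// NOTE: engine.discard(begin) above fast-forwards the engine to this string's
// offset, so each row's characters are a pure function of its position and the
// output is reproducible across runs. char_dist draws from the inclusive range
// [44, 122] (',' through 'z'), i.e. printable ASCII; the '\x7F' branches in
// this loop only fire if that range is ever widened past the ASCII boundary.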
+ if (ch >= '\x7F') // x7F is at the top edge of ASCII + chars[i++] = '\xC4'; // these characters are assigned two bytes + chars[i] = static_cast(ch + (ch >= '\x7F')); + } + } +}; + +// Functor for generating random numbers +template +struct random_number_generator { + T lower; + T upper; + + CUDF_HOST_DEVICE random_number_generator(T lower, T upper) : lower(lower), upper(upper) {} + + __device__ T operator()(const int64_t idx) const + { + if constexpr (cudf::is_integral()) { + thrust::default_random_engine engine; + thrust::uniform_int_distribution dist(lower, upper); + engine.discard(idx); + return dist(engine); + } else { + thrust::default_random_engine engine; + thrust::uniform_real_distribution dist(lower, upper); + engine.discard(idx); + return dist(engine); + } + } +}; + +} // namespace + +std::unique_ptr generate_random_string_column(cudf::size_type lower, + cudf::size_type upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto offsets_begin = cudf::detail::make_counting_transform_iterator( + 0, random_number_generator(lower, upper)); + auto [offsets_column, computed_bytes] = cudf::strings::detail::make_offsets_child_column( + offsets_begin, offsets_begin + num_rows, stream, mr); + rmm::device_uvector chars(computed_bytes, stream); + + auto const offset_itr = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); + + // We generate the strings in parallel into the `chars` vector using the + // offsets vector generated above. + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(offset_itr, offset_itr + 1), + num_rows, + random_string_generator(chars.data())); + + return cudf::make_strings_column( + num_rows, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); +} + +template +std::unique_ptr generate_random_numeric_column(T lower, + T upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto col = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); + cudf::size_type begin = 0; + cudf::size_type end = num_rows; + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(begin), + thrust::make_counting_iterator(end), + col->mutable_view().begin(), + random_number_generator(lower, upper)); + return col; +} + +template std::unique_ptr generate_random_numeric_column( + int8_t lower, + int8_t upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr generate_random_numeric_column( + int16_t lower, + int16_t upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr generate_random_numeric_column( + cudf::size_type lower, + cudf::size_type upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr generate_random_numeric_column( + double lower, + double upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +std::unique_ptr generate_primary_key_column(cudf::scalar const& start, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return cudf::sequence(num_rows, start, stream, mr); +} + +std::unique_ptr generate_repeat_string_column(std::string 
const& value, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto const scalar = cudf::string_scalar(value); + return cudf::make_column_from_scalar(scalar, num_rows, stream, mr); +} + +std::unique_ptr generate_random_string_column_from_set( + cudf::host_span set, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + // Build a gather map of random strings to choose from + // The size of the string sets always fits within 16-bit integers + auto const indices = + generate_primary_key_column(cudf::numeric_scalar(0), set.size(), stream, mr); + auto const keys = cudf::test::strings_column_wrapper(set.begin(), set.end()).release(); + auto const gather_map = cudf::table_view({indices->view(), keys->view()}); + + // Build a column of random keys to gather from the set + auto const gather_keys = + generate_random_numeric_column(0, set.size() - 1, num_rows, stream, mr); + + // Perform the gather operation + auto const gathered_table = cudf::gather( + gather_map, gather_keys->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr); + auto gathered_table_columns = gathered_table->release(); + return std::move(gathered_table_columns[1]); +} + +template +std::unique_ptr generate_repeat_sequence_column(T seq_length, + bool zero_indexed, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto pkey = + generate_primary_key_column(cudf::numeric_scalar(0), num_rows, stream, mr); + auto repeat_seq_zero_indexed = cudf::binary_operation(pkey->view(), + cudf::numeric_scalar(seq_length), + cudf::binary_operator::MOD, + cudf::data_type{cudf::type_to_id()}, + stream, + mr); + if (zero_indexed) { return repeat_seq_zero_indexed; } + return cudf::binary_operation(repeat_seq_zero_indexed->view(), + cudf::numeric_scalar(1), + cudf::binary_operator::ADD, + cudf::data_type{cudf::type_to_id()}, + stream, + mr); +} + +template std::unique_ptr generate_repeat_sequence_column( + int8_t seq_length, + bool zero_indexed, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr generate_repeat_sequence_column( + cudf::size_type seq_length, + bool zero_indexed, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::datagen diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp new file mode 100644 index 00000000000..3e254f49805 --- /dev/null +++ b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf::datagen { + +/** + * @brief Generate a column of random strings + * + * @param lower The lower bound of the length of the strings + * @param upper The upper bound of the length of the strings + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr generate_random_string_column( + cudf::size_type lower, + cudf::size_type upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate a column of random numbers + * + * Example: + * + * lower = 10 + * upper = 15 + * num_rows = 10 + * result = [10, 11, 14, 14, 13, 12, 11, 11, 12, 14] + + * + * @param lower The lower bound of the random numbers + * @param upper The upper bound of the random numbers + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +template +std::unique_ptr generate_random_numeric_column( + T lower, + T upper, + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate a primary key column + * + * Example: + * + * start = 1 + * num_rows = 10 + * result = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + * + * @param start The starting value of the primary key + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr generate_primary_key_column( + cudf::scalar const& start, + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate a column where all the rows have the same string value + * + * Example: + * + * value = "abc" + * num_rows = 5 + * result = ["abc", "abc", "abc", "abc", "abc"] + * + * @param value The string value to fill the column with + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr generate_repeat_string_column( + std::string const& value, + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate a column by randomly choosing from set of strings + * + * Example: + * + * set = {"s1", "s2", "s3"} + * num_rows = 10 + * result = ["s1", "s2", "s2", "s1", "s3", "s3", "s3", "s2", "s1", "s1"] + * + * @param set The set of strings to choose from + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr generate_random_string_column_from_set( + cudf::host_span set, + cudf::size_type num_rows, + 
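// Illustrative usage of the generators declared above (an editorial sketch,
// not code from this patch; `n` is a hypothetical row count):
//
//   cudf::size_type const n = 1000;
//   auto strs  = cudf::datagen::generate_random_string_column(5, 10, n);
//   auto nums  = cudf::datagen::generate_random_numeric_column<double>(0.0, 1.0, n);
//   auto pkeys = cudf::datagen::generate_primary_key_column(
//     cudf::numeric_scalar<cudf::size_type>(1), n);
//
// All of them fall back to the default stream and memory resource when the
// trailing arguments are omitted, as declared in this header.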
rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate a column consisting of a repeating sequence of integers + * + * Example: + * + * seq_length = 3 + * zero_indexed = false + * num_rows = 10 + * result = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1] + * + * @param seq_length The length of the repeating sequence + * @param zero_indexed Whether the sequence is zero or one indexed + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +template +std::unique_ptr generate_repeat_sequence_column( + T seq_length, + bool zero_indexed, + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf::datagen diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp b/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp new file mode 100644 index 00000000000..36bf9c49cea --- /dev/null +++ b/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "table_helpers.hpp" + +#include "random_column_generator.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::datagen { + +/** + * @brief Add a column of days to a column of timestamp_days + * + * @param timestamp_days The column of timestamp_days + * @param days The column of days to add + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr add_calendrical_days(cudf::column_view const& timestamp_days, + cudf::column_view const& days, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto const days_duration_type = cudf::cast(days, cudf::data_type{cudf::type_id::DURATION_DAYS}); + auto const data_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}; + return cudf::binary_operation( + timestamp_days, days_duration_type->view(), cudf::binary_operator::ADD, data_type, stream, mr); +} + +/** + * @brief Perform a left join operation between two tables + * + * @param left_input The left table + * @param right_input The right table + * @param left_on The indices of the columns to join on in the left table + * @param right_on The indices of the columns to join on in the right table + * @param compare_nulls The null equality comparison + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + */ +std::unique_ptr perform_left_join(cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + constexpr auto oob_policy = cudf::out_of_bounds_policy::NULLIFY; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr); + + auto const left_indices_span = cudf::device_span{*left_join_indices}; + auto const right_indices_span = cudf::device_span{*right_join_indices}; + + auto const left_indices_col = cudf::column_view{left_indices_span}; + auto const right_indices_col = cudf::column_view{right_indices_span}; + + auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy, stream, mr); + auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy, stream, mr); + + auto joined_cols = left_result->release(); + auto right_cols = right_result->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique(std::move(joined_cols)); +} + +/** + * @brief Generate the `p_retailprice` column of the `part` table + * + * @param p_partkey The `p_partkey` column of the `part` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_p_retailprice( + cudf::column_view const& p_partkey, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + // Expression: 
(90000 + ((p_partkey/10) modulo 20001) + 100 * (p_partkey modulo 1000)) / 100 + auto table = cudf::table_view({p_partkey}); + auto p_partkey_col_ref = cudf::ast::column_reference(0); + + auto scalar_10 = cudf::numeric_scalar(10); + auto scalar_100 = cudf::numeric_scalar(100); + auto scalar_1000 = cudf::numeric_scalar(1000); + auto scalar_20001 = cudf::numeric_scalar(20001); + auto scalar_90000 = cudf::numeric_scalar(90000); + + auto literal_10 = cudf::ast::literal(scalar_10); + auto literal_100 = cudf::ast::literal(scalar_100); + auto literal_1000 = cudf::ast::literal(scalar_1000); + auto literal_20001 = cudf::ast::literal(scalar_20001); + auto literal_90000 = cudf::ast::literal(scalar_90000); + + auto expr_a = cudf::ast::operation(cudf::ast::ast_operator::DIV, p_partkey_col_ref, literal_10); + auto expr_b = cudf::ast::operation(cudf::ast::ast_operator::MOD, expr_a, literal_20001); + auto expr_c = cudf::ast::operation(cudf::ast::ast_operator::MOD, p_partkey_col_ref, literal_1000); + auto expr_d = cudf::ast::operation(cudf::ast::ast_operator::MUL, expr_c, literal_100); + auto expr_e = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_b, expr_d); + auto expr_f = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_e, literal_90000); + auto final_expr = cudf::ast::operation(cudf::ast::ast_operator::TRUE_DIV, expr_f, literal_100); + + // Execute the AST expression + return cudf::compute_column(table, final_expr, stream, mr); +} + +/** + * @brief Generate the `l_suppkey` column of the `lineitem` table + * + * @param l_partkey The `l_partkey` column of the `lineitem` table + * @param scale_factor The scale factor to use + * @param num_rows The number of rows in the `lineitem` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_l_suppkey(cudf::column_view const& l_partkey, + cudf::size_type scale_factor, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + // Expression: (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s + 1 + + // Generate the `s` col + auto s_empty = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT32}, num_rows, cudf::mask_state::UNALLOCATED, stream); + + auto s = cudf::fill(s_empty->view(), + 0, + num_rows, + cudf::numeric_scalar(scale_factor * 10'000), + stream, + mr); + + // Generate the `i` col + auto i = generate_repeat_sequence_column(4, true, num_rows, stream, mr); + + // Create a table view out of `l_partkey`, `s`, and `i` + auto table = cudf::table_view({l_partkey, s->view(), i->view()}); + + // Create the AST expression + auto scalar_1 = cudf::numeric_scalar(1); + auto scalar_4 = cudf::numeric_scalar(4); + auto literal_1 = cudf::ast::literal(scalar_1); + auto literal_4 = cudf::ast::literal(scalar_4); + + auto l_partkey_col_ref = cudf::ast::column_reference(0); + auto s_col_ref = cudf::ast::column_reference(1); + auto i_col_ref = cudf::ast::column_reference(2); + + // (int)(l_partkey - 1)/s + auto expr_a = cudf::ast::operation(cudf::ast::ast_operator::SUB, l_partkey_col_ref, literal_1); + auto expr_b = cudf::ast::operation(cudf::ast::ast_operator::DIV, expr_a, s_col_ref); + + // s/4 + auto expr_c = cudf::ast::operation(cudf::ast::ast_operator::DIV, s_col_ref, literal_4); + + // (s/4 + (int)(l_partkey - 1)/s) + auto expr_d = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_c, expr_b); + + // (i * 
(s/4 + (int)(l_partkey - 1)/s)) + auto expr_e = cudf::ast::operation(cudf::ast::ast_operator::MUL, i_col_ref, expr_d); + + // (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) + auto expr_f = cudf::ast::operation(cudf::ast::ast_operator::ADD, l_partkey_col_ref, expr_e); + + // (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s + auto expr_g = cudf::ast::operation(cudf::ast::ast_operator::MOD, expr_f, s_col_ref); + + // (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s + 1 + auto final_expr = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_g, literal_1); + + // Execute the AST expression + return cudf::compute_column(table, final_expr, stream, mr); +} + +/** + * @brief Generate the `ps_suppkey` column of the `partsupp` table + * + * @param ps_partkey The `ps_partkey` column of the `partsupp` table + * @param scale_factor The scale factor to use + * @param num_rows The number of rows in the `partsupp` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_ps_suppkey( + cudf::column_view const& ps_partkey, + cudf::size_type scale_factor, + cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + // Expression: ps_suppkey = (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1 + + // Generate the `s` col + auto s_empty = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT32}, num_rows, cudf::mask_state::UNALLOCATED, stream); + + auto s = cudf::fill(s_empty->view(), + 0, + num_rows, + cudf::numeric_scalar(scale_factor * 10'000), + stream, + mr); + + // Generate the `i` col + auto i = generate_repeat_sequence_column(4, true, num_rows, stream, mr); + + // Create a table view out of `p_partkey`, `s`, and `i` + auto table = cudf::table_view({ps_partkey, s->view(), i->view()}); + + // Create the AST expression + auto scalar_1 = cudf::numeric_scalar(1); + auto scalar_4 = cudf::numeric_scalar(4); + auto literal_1 = cudf::ast::literal(scalar_1); + auto literal_4 = cudf::ast::literal(scalar_4); + + auto ps_partkey_col_ref = cudf::ast::column_reference(0); + auto s_col_ref = cudf::ast::column_reference(1); + auto i_col_ref = cudf::ast::column_reference(2); + + // (int)(ps_partkey - 1)/s + auto expr_a = cudf::ast::operation(cudf::ast::ast_operator::SUB, ps_partkey_col_ref, literal_1); + auto expr_b = cudf::ast::operation(cudf::ast::ast_operator::DIV, expr_a, s_col_ref); + + // s/4 + auto expr_c = cudf::ast::operation(cudf::ast::ast_operator::DIV, s_col_ref, literal_4); + + // (s/4 + (int)(ps_partkey - 1)/s) + auto expr_d = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_c, expr_b); + + // (i * (s/4 + (int)(ps_partkey - 1)/s)) + auto expr_e = cudf::ast::operation(cudf::ast::ast_operator::MUL, i_col_ref, expr_d); + + // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) + auto expr_f = cudf::ast::operation(cudf::ast::ast_operator::ADD, ps_partkey_col_ref, expr_e); + + // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + auto expr_g = cudf::ast::operation(cudf::ast::ast_operator::MOD, expr_f, s_col_ref); + + // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1 + auto final_expr = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_g, literal_1); + + // Execute the AST expression + return cudf::compute_column(table, final_expr, stream, mr); +} + +/** + * @brief Calculate the cardinality of the `lineitem` 
table + * + * @param o_rep_freqs The frequency of each `o_orderkey` value in the `lineitem` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] cudf::size_type calculate_l_cardinality(cudf::column_view const& o_rep_freqs, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto const sum_agg = cudf::make_sum_aggregation(); + auto const l_num_rows_scalar = + cudf::reduce(o_rep_freqs, *sum_agg, cudf::data_type{cudf::type_id::INT32}, stream, mr); + return reinterpret_cast*>(l_num_rows_scalar.get()) + ->value(stream); +} + +/** + * @brief Calculate the charge column for the `lineitem` table + * + * @param extendedprice The `l_extendedprice` column + * @param tax The `l_tax` column + * @param discount The `l_discount` column + * @param stream The CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_charge(cudf::column_view const& extendedprice, + cudf::column_view const& tax, + cudf::column_view const& discount, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = cudf::binary_operation( + one, discount, cudf::binary_operator::SUB, cudf::data_type{cudf::type_id::FLOAT64}, stream, mr); + auto disc_price = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + cudf::data_type{cudf::type_id::FLOAT64}, + stream, + mr); + auto const one_plus_tax = + cudf::binary_operation(one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr); + return cudf::binary_operation(disc_price->view(), + one_plus_tax->view(), + cudf::binary_operator::MUL, + cudf::data_type{cudf::type_id::FLOAT64}, + stream, + mr); +} + +/** + * @brief Generate a column of random addresses according to TPC-H specification clause 4.2.2.7 + * + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr generate_address_column( + cudf::size_type num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return generate_random_string_column(10, 40, num_rows, stream, mr); +} + +/** + * @brief Generate a phone number column according to TPC-H specification clause 4.2.2.9 + * + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr generate_phone_column(cudf::size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto const part_a = cudf::strings::from_integers( + generate_random_numeric_column(10, 34, num_rows, stream, mr)->view()); + auto const part_b = cudf::strings::from_integers( + generate_random_numeric_column(100, 999, num_rows, stream, mr)->view()); + auto const part_c = cudf::strings::from_integers( + generate_random_numeric_column(100, 999, num_rows, stream, mr)->view()); + auto const part_d = cudf::strings::from_integers( + 
generate_random_numeric_column(1000, 9999, num_rows, stream, mr)->view()); + auto const phone_parts_table = + cudf::table_view({part_a->view(), part_b->view(), part_c->view(), part_d->view()}); + return cudf::strings::concatenate(phone_parts_table, + cudf::string_scalar("-"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO, + stream, + mr); +} + +} // namespace cudf::datagen diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp b/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp new file mode 100644 index 00000000000..11091689469 --- /dev/null +++ b/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::datagen { + +/** + * @brief Add a column of days to a column of timestamp_days + * + * @param timestamp_days The column of timestamp_days + * @param days The column of days to add + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr add_calendrical_days( + cudf::column_view const& timestamp_days, + cudf::column_view const& days, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Perform a left join operation between two tables + * + * @param left_input The left table + * @param right_input The right table + * @param left_on The indices of the columns to join on in the left table + * @param right_on The indices of the columns to join on in the right table + * @param compare_nulls The null equality comparison + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + */ +std::unique_ptr perform_left_join( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate the `p_retailprice` column of the `part` table + * + * @param p_partkey The `p_partkey` column of the `part` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_p_retailprice( + cudf::column_view const& p_partkey, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate the `l_suppkey` column of the `lineitem` table + * + * @param l_partkey The `l_partkey` column of the `lineitem` 
table + * @param scale_factor The scale factor to use + * @param num_rows The number of rows in the `lineitem` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_l_suppkey( + cudf::column_view const& l_partkey, + cudf::size_type scale_factor, + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate the `ps_suppkey` column of the `partsupp` table + * + * @param ps_partkey The `ps_partkey` column of the `partsupp` table + * @param scale_factor The scale factor to use + * @param num_rows The number of rows in the `partsupp` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_ps_suppkey( + cudf::column_view const& ps_partkey, + cudf::size_type scale_factor, + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Calculate the cardinality of the `lineitem` table + * + * @param o_rep_freqs The frequency of each `o_orderkey` value in the `lineitem` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] cudf::size_type calculate_l_cardinality( + cudf::column_view const& o_rep_freqs, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Calculate the charge column for the `lineitem` table + * + * @param extendedprice The `l_extendedprice` column + * @param tax The `l_tax` column + * @param discount The `l_discount` column + * @param stream The CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr calculate_charge( + cudf::column_view const& extendedprice, + cudf::column_view const& tax, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate a column of random addresses according to TPC-H specification clause 4.2.2.7 + * + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr generate_address_column( + cudf::size_type num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Generate a phone number column according to TPC-H specification clause 4.2.2.9 + * + * @param num_rows The number of rows in the column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +[[nodiscard]] std::unique_ptr generate_phone_column( + cudf::size_type 
num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf::datagen diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp new file mode 100644 index 00000000000..9001c50c5a5 --- /dev/null +++ b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp @@ -0,0 +1,987 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tpch_data_generator.hpp" + +#include "random_column_generator.hpp" +#include "table_helpers.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::datagen { + +namespace { +constexpr std::array nations{ + "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE", + "GERMANY", "INDIA", "INDONESIA", "IRAN", "IRAQ", "JAPAN", "JORDAN", + "KENYA", "MOROCCO", "MOZAMBIQUE", "PERU", "CHINA", "ROMANIA", "SAUDI ARABIA", + "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES"}; + +constexpr std::array years{"1992", "1993", "1994", "1995", "1996", "1997", "1998"}; +constexpr std::array months{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"}; +constexpr std::array days{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", + "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", + "23", "24", "25", "26", "27", "28", "29", "30", "31"}; + +constexpr std::array vocab_p_name{ + "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", + "blanched", "blue", "blush", "brown", "burlywood", "burnished", "chartreuse", + "chiffon", "chocolate", "coral", "cornflower", "cornsilk", "cream", "cyan", + "dark", "deep", "dim", "dodger", "drab", "firebrick", "floral", + "forest", "frosted", "gainsboro", "ghost", "goldenrod", "green", "grey", + "honeydew", "hot", "indian", "ivory", "khaki", "lace", "lavender", + "lawn", "lemon", "light", "lime", "linen", "magenta", "maroon", + "medium", "metallic", "midnight", "mint", "misty", "moccasin", "navajo", + "navy", "olive", "orange", "orchid", "pale", "papaya", "peach", + "peru", "pink", "plum", "powder", "puff", "purple", "red", + "rose", "rosy", "royal", "saddle", "salmon", "sandy", "seashell", + "sienna", "sky", "slate", "smoke", "snow", "spring", "steel", + "tan", "thistle", "tomato", "turquoise", "violet", "wheat", "white", + "yellow"}; + +constexpr std::array vocab_modes{"REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB"}; + +constexpr std::array vocab_instructions{ + "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN"}; + +constexpr std::array vocab_priorities{"1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"}; + +constexpr std::array vocab_segments{ + "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"}; + +constexpr 
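// The part-type vocabulary below enumerates the full cross product of six
// size grades, five finishes, and five materials (6 * 5 * 5 = 150 strings),
// matching the p_type syllables in the TPC-H specification.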
std::array vocab_types{ + "STANDARD ANODIZED TIN", "STANDARD ANODIZED NICKEL", "STANDARD ANODIZED BRASS", + "STANDARD ANODIZED STEEL", "STANDARD ANODIZED COPPER", "STANDARD BURNISHED TIN", + "STANDARD BURNISHED NICKEL", "STANDARD BURNISHED BRASS", "STANDARD BURNISHED STEEL", + "STANDARD BURNISHED COPPER", "STANDARD PLATED TIN", "STANDARD PLATED NICKEL", + "STANDARD PLATED BRASS", "STANDARD PLATED STEEL", "STANDARD PLATED COPPER", + "STANDARD POLISHED TIN", "STANDARD POLISHED NICKEL", "STANDARD POLISHED BRASS", + "STANDARD POLISHED STEEL", "STANDARD POLISHED COPPER", "STANDARD BRUSHED TIN", + "STANDARD BRUSHED NICKEL", "STANDARD BRUSHED BRASS", "STANDARD BRUSHED STEEL", + "STANDARD BRUSHED COPPER", "SMALL ANODIZED TIN", "SMALL ANODIZED NICKEL", + "SMALL ANODIZED BRASS", "SMALL ANODIZED STEEL", "SMALL ANODIZED COPPER", + "SMALL BURNISHED TIN", "SMALL BURNISHED NICKEL", "SMALL BURNISHED BRASS", + "SMALL BURNISHED STEEL", "SMALL BURNISHED COPPER", "SMALL PLATED TIN", + "SMALL PLATED NICKEL", "SMALL PLATED BRASS", "SMALL PLATED STEEL", + "SMALL PLATED COPPER", "SMALL POLISHED TIN", "SMALL POLISHED NICKEL", + "SMALL POLISHED BRASS", "SMALL POLISHED STEEL", "SMALL POLISHED COPPER", + "SMALL BRUSHED TIN", "SMALL BRUSHED NICKEL", "SMALL BRUSHED BRASS", + "SMALL BRUSHED STEEL", "SMALL BRUSHED COPPER", "MEDIUM ANODIZED TIN", + "MEDIUM ANODIZED NICKEL", "MEDIUM ANODIZED BRASS", "MEDIUM ANODIZED STEEL", + "MEDIUM ANODIZED COPPER", "MEDIUM BURNISHED TIN", "MEDIUM BURNISHED NICKEL", + "MEDIUM BURNISHED BRASS", "MEDIUM BURNISHED STEEL", "MEDIUM BURNISHED COPPER", + "MEDIUM PLATED TIN", "MEDIUM PLATED NICKEL", "MEDIUM PLATED BRASS", + "MEDIUM PLATED STEEL", "MEDIUM PLATED COPPER", "MEDIUM POLISHED TIN", + "MEDIUM POLISHED NICKEL", "MEDIUM POLISHED BRASS", "MEDIUM POLISHED STEEL", + "MEDIUM POLISHED COPPER", "MEDIUM BRUSHED TIN", "MEDIUM BRUSHED NICKEL", + "MEDIUM BRUSHED BRASS", "MEDIUM BRUSHED STEEL", "MEDIUM BRUSHED COPPER", + "LARGE ANODIZED TIN", "LARGE ANODIZED NICKEL", "LARGE ANODIZED BRASS", + "LARGE ANODIZED STEEL", "LARGE ANODIZED COPPER", "LARGE BURNISHED TIN", + "LARGE BURNISHED NICKEL", "LARGE BURNISHED BRASS", "LARGE BURNISHED STEEL", + "LARGE BURNISHED COPPER", "LARGE PLATED TIN", "LARGE PLATED NICKEL", + "LARGE PLATED BRASS", "LARGE PLATED STEEL", "LARGE PLATED COPPER", + "LARGE POLISHED TIN", "LARGE POLISHED NICKEL", "LARGE POLISHED BRASS", + "LARGE POLISHED STEEL", "LARGE POLISHED COPPER", "LARGE BRUSHED TIN", + "LARGE BRUSHED NICKEL", "LARGE BRUSHED BRASS", "LARGE BRUSHED STEEL", + "LARGE BRUSHED COPPER", "ECONOMY ANODIZED TIN", "ECONOMY ANODIZED NICKEL", + "ECONOMY ANODIZED BRASS", "ECONOMY ANODIZED STEEL", "ECONOMY ANODIZED COPPER", + "ECONOMY BURNISHED TIN", "ECONOMY BURNISHED NICKEL", "ECONOMY BURNISHED BRASS", + "ECONOMY BURNISHED STEEL", "ECONOMY BURNISHED COPPER", "ECONOMY PLATED TIN", + "ECONOMY PLATED NICKEL", "ECONOMY PLATED BRASS", "ECONOMY PLATED STEEL", + "ECONOMY PLATED COPPER", "ECONOMY POLISHED TIN", "ECONOMY POLISHED NICKEL", + "ECONOMY POLISHED BRASS", "ECONOMY POLISHED STEEL", "ECONOMY POLISHED COPPER", + "ECONOMY BRUSHED TIN", "ECONOMY BRUSHED NICKEL", "ECONOMY BRUSHED BRASS", + "ECONOMY BRUSHED STEEL", "ECONOMY BRUSHED COPPER", "PROMO ANODIZED TIN", + "PROMO ANODIZED NICKEL", "PROMO ANODIZED BRASS", "PROMO ANODIZED STEEL", + "PROMO ANODIZED COPPER", "PROMO BURNISHED TIN", "PROMO BURNISHED NICKEL", + "PROMO BURNISHED BRASS", "PROMO BURNISHED STEEL", "PROMO BURNISHED COPPER", + "PROMO PLATED TIN", "PROMO PLATED NICKEL", "PROMO PLATED BRASS", + "PROMO PLATED STEEL", 
"PROMO PLATED COPPER", "PROMO POLISHED TIN", + "PROMO POLISHED NICKEL", "PROMO POLISHED BRASS", "PROMO POLISHED STEEL", + "PROMO POLISHED COPPER", "PROMO BRUSHED TIN", "PROMO BRUSHED NICKEL", + "PROMO BRUSHED BRASS", "PROMO BRUSHED STEEL", "PROMO BRUSHED COPPER"}; + +constexpr std::array vocab_containers{ + "SM CASE", "SM BOX", "SM BAG", "SM JAR", "SM PKG", "SM PACK", "SM CAN", + "SM DRUM", "LG CASE", "LG BOX", "LG BAG", "LG JAR", "LG PKG", "LG PACK", + "LG CAN", "LG DRUM", "MED CASE", "MED BOX", "MED BAG", "MED JAR", "MED PKG", + "MED PACK", "MED CAN", "MED DRUM", "JUMBO CASE", "JUMBO BOX", "JUMBO BAG", "JUMBO JAR", + "JUMBO PKG", "JUMBO PACK", "JUMBO CAN", "JUMBO DRUM", "WRAP CASE", "WRAP BOX", "WRAP BAG", + "WRAP JAR", "WRAP PKG", "WRAP PACK", "WRAP CAN", "WRAP DRUM"}; + +} // namespace + +/** + * @brief Generate a table out of the independent columns of the `orders` table + * + * @param scale_factor The scale factor to generate + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr generate_orders_independent(double scale_factor, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + cudf::size_type const o_num_rows = scale_factor * 1'500'000; + + // Generate the `o_orderkey` column + auto o_orderkey = [&]() { + auto const o_orderkey_candidates = generate_primary_key_column( + cudf::numeric_scalar(1), 4 * o_num_rows, stream, mr); + auto const o_orderkey_unsorted = cudf::sample(cudf::table_view({o_orderkey_candidates->view()}), + o_num_rows, + cudf::sample_with_replacement::FALSE, + 0, + stream, + mr); + auto const sort_result = + cudf::sort_by_key(o_orderkey_unsorted->view(), + cudf::table_view({o_orderkey_unsorted->view().column(0)}), + {}, + {}, + stream, + mr); + return std::move(sort_result->release()[0]); + }(); + + // Generate the `o_custkey` column + auto o_custkey = [&]() { + auto const col = generate_random_numeric_column( + 1, scale_factor * 49'000, o_num_rows, stream, mr); + auto const col_mul_3 = cudf::binary_operation(col->view(), + cudf::numeric_scalar(3), + cudf::binary_operator::MUL, + cudf::data_type{cudf::type_id::INT32}, + stream, + mr); + return cudf::binary_operation(col_mul_3->view(), + cudf::numeric_scalar(1), + cudf::binary_operator::ADD, + cudf::data_type{cudf::type_id::INT32}, + stream, + mr); + }(); + + // Generate the `o_orderdate` column + auto o_orderdate_ts = [&]() { + auto const o_orderdate_year = generate_random_string_column_from_set( + cudf::host_span(years.data(), years.size()), o_num_rows, stream, mr); + auto const o_orderdate_month = generate_random_string_column_from_set( + cudf::host_span(months.data(), months.size()), o_num_rows, stream, mr); + auto const o_orderdate_day = generate_random_string_column_from_set( + cudf::host_span(days.data(), days.size()), o_num_rows, stream, mr); + auto const o_orderdate_str = cudf::strings::concatenate( + cudf::table_view( + {o_orderdate_year->view(), o_orderdate_month->view(), o_orderdate_day->view()}), + cudf::string_scalar("-"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO, + stream, + mr); + + return cudf::strings::to_timestamps(o_orderdate_str->view(), + cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, + std::string("%Y-%m-%d"), + stream, + mr); + }(); + + // Generate the `o_orderpriority` column + auto o_orderpriority = generate_random_string_column_from_set( + 
cudf::host_span(vocab_priorities.data(), vocab_priorities.size()), + o_num_rows, + stream, + mr); + + // Generate the `o_clerk` column + auto o_clerk = [&]() { + auto const clerk_repeat = generate_repeat_string_column("Clerk#", o_num_rows, stream, mr); + auto const random_c = generate_random_numeric_column( + 1, scale_factor * 1'000, o_num_rows, stream, mr); + auto const random_c_str = cudf::strings::from_integers(random_c->view(), stream, mr); + auto const random_c_str_padded = cudf::strings::zfill(random_c_str->view(), 9, stream, mr); + return cudf::strings::concatenate( + cudf::table_view({clerk_repeat->view(), random_c_str_padded->view()}), + cudf::string_scalar(""), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO, + stream, + mr); + }(); + + // Generate the `o_shippriority` column + auto o_shippriority = [&]() { + auto const empty = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT8}, o_num_rows, cudf::mask_state::UNALLOCATED, stream); + return cudf::fill(empty->view(), 0, o_num_rows, cudf::numeric_scalar(0), stream, mr); + }(); + + // Generate the `o_comment` column + // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification + auto o_comment = generate_random_string_column(19, 78, o_num_rows, stream, mr); + + // Generate the `orders_independent` table + std::vector> columns; + columns.push_back(std::move(o_orderkey)); + columns.push_back(std::move(o_custkey)); + columns.push_back(std::move(o_orderdate_ts)); + columns.push_back(std::move(o_orderpriority)); + columns.push_back(std::move(o_clerk)); + columns.push_back(std::move(o_shippriority)); + columns.push_back(std::move(o_comment)); + return std::make_unique(std::move(columns)); +} + +/** + * @brief Generate the `lineitem` table partially + * + * @param orders_independent Table with the independent columns of the `orders` table + * @param scale_factor The scale factor to generate + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr generate_lineitem_partial(cudf::table_view const& orders_independent, + double scale_factor, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + auto const o_num_rows = orders_independent.num_rows(); + // Generate the `lineitem` table. For each row in the `orders` table, + // we have a random number (between 1 and 7) of rows in the `lineitem` table + + // For each `o_orderkey`, generate a random number (between 1 and 7), + // which will be the number of rows in the `lineitem` table that will + // have the same `l_orderkey` + auto const o_rep_freqs = generate_random_numeric_column(1, 7, o_num_rows, stream, mr); + + // Sum up the `o_rep_freqs` to get the number of rows in the + // `lineitem` table. 
This is required to generate the independent columns + // in the `lineitem` table + auto const l_num_rows = calculate_l_cardinality(o_rep_freqs->view(), stream, mr); + + // We create a table out of `o_orderkey` and `o_orderdate_ts` by repeating + // the rows of `orders` according to the frequencies in `o_rep_freqs` + auto const o_orderkey = orders_independent.column(0); + auto const o_orderdate_ts = orders_independent.column(2); + auto const l_base = + cudf::repeat(cudf::table_view({o_orderkey, o_orderdate_ts}), o_rep_freqs->view(), stream, mr); + auto l_base_columns = l_base->release(); + + // Generate the `l_orderkey` column + auto l_orderkey = std::move(l_base_columns[0]); + + // Generate the `l_partkey` column + auto l_partkey = generate_random_numeric_column( + 1, scale_factor * 200'000, l_num_rows, stream, mr); + + // Generate the `l_suppkey` column + auto l_suppkey = calculate_l_suppkey(l_partkey->view(), scale_factor, l_num_rows, stream, mr); + + // Generate the `l_linenumber` column + auto l_linenumber = generate_repeat_sequence_column(7, false, l_num_rows, stream, mr); + + // Generate the `l_quantity` column + auto l_quantity = generate_random_numeric_column(1, 50, l_num_rows, stream, mr); + + // Generate the `l_discount` column + auto l_discount = [&]() { + auto const col = generate_random_numeric_column(0.00, 0.10, l_num_rows, stream, mr); + return cudf::round(col->view(), 2); + }(); + + // Generate the `l_tax` column + auto l_tax = [&]() { + auto const col = generate_random_numeric_column(0.00, 0.08, l_num_rows, stream, mr); + return cudf::round(col->view(), 2); + }(); + + // Get the orderdate column from the `l_base` table + auto const ol_orderdate_ts = std::move(l_base_columns[1]); + + // Generate the `l_shipdate` column + auto l_shipdate_ts = [&]() { + auto const l_shipdate_rand_add_days = + generate_random_numeric_column(1, 121, l_num_rows, stream, mr); + return add_calendrical_days( + ol_orderdate_ts->view(), l_shipdate_rand_add_days->view(), stream, mr); + }(); + + // Generate the `l_commitdate` column + auto l_commitdate_ts = [&]() { + auto const l_commitdate_rand_add_days = + generate_random_numeric_column(30, 90, l_num_rows, stream, mr); + return add_calendrical_days( + ol_orderdate_ts->view(), l_commitdate_rand_add_days->view(), stream, mr); + }(); + + // Generate the `l_receiptdate` column + auto l_receiptdate_ts = [&]() { + auto const l_receiptdate_rand_add_days = + generate_random_numeric_column(1, 30, l_num_rows, stream, mr); + return add_calendrical_days( + l_shipdate_ts->view(), l_receiptdate_rand_add_days->view(), stream, mr); + }(); + + // Define the current date as per clause 4.2.2.12 of the TPC-H specification + constexpr cudf::size_type current_date_days_since_epoch = 9'298; + auto current_date = + cudf::timestamp_scalar(current_date_days_since_epoch, true); + auto current_date_literal = cudf::ast::literal(current_date); + + // Generate the `l_returnflag` column + // if `l_receiptdate` <= current_date then "R" or "A" else "N" + auto l_returnflag = [&]() { + auto const col_ref = cudf::ast::column_reference(0); + auto const pred = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, current_date_literal); + auto const binary_mask = + cudf::compute_column(cudf::table_view({l_receiptdate_ts->view()}), pred, stream, mr); + + auto const multiplier = + generate_repeat_sequence_column(2, false, l_num_rows, stream, mr); + auto const ternary_mask = cudf::binary_operation(binary_mask->view(), + multiplier->view(), + cudf::binary_operator::MUL, + 
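// NOTE: this MUL folds two signals into a single gather index: the boolean
// mask (l_receiptdate <= current_date) times the repeating 1,2 sequence
// yields 0, 1, or 2 per row, which the gather below maps onto "N", "A", "R".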
+
+  // Define the current date as per clause 4.2.2.12 of the TPC-H specification
+  constexpr cudf::size_type current_date_days_since_epoch = 9'298;
+  auto current_date =
+    cudf::timestamp_scalar<cudf::timestamp_D>(current_date_days_since_epoch, true);
+  auto current_date_literal = cudf::ast::literal(current_date);
+
+  // Generate the `l_returnflag` column
+  // if `l_receiptdate` <= current_date then "R" or "A" else "N"
+  auto l_returnflag = [&]() {
+    auto const col_ref = cudf::ast::column_reference(0);
+    auto const pred =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, current_date_literal);
+    auto const binary_mask =
+      cudf::compute_column(cudf::table_view({l_receiptdate_ts->view()}), pred, stream, mr);
+
+    auto const multiplier =
+      generate_repeat_sequence_column<int8_t>(2, false, l_num_rows, stream, mr);
+    auto const ternary_mask = cudf::binary_operation(binary_mask->view(),
+                                                     multiplier->view(),
+                                                     cudf::binary_operator::MUL,
+                                                     cudf::data_type{cudf::type_id::INT8},
+                                                     stream,
+                                                     mr);
+    auto const indices = cudf::test::fixed_width_column_wrapper<int8_t>({0, 1, 2}).release();
+    auto const keys    = cudf::test::strings_column_wrapper({"N", "A", "R"}).release();
+    auto const gather_map = cudf::table_view({indices->view(), keys->view()});
+    auto const gathered_table = cudf::gather(
+      gather_map, ternary_mask->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr);
+    return std::move(gathered_table->release()[1]);
+  }();
+
+  // Generate the `l_linestatus` column
+  // if `l_shipdate` > current_date then "F" else "O"
+  auto [l_linestatus, l_linestatus_mask] = [&]() {
+    auto const col_ref = cudf::ast::column_reference(0);
+    auto const pred =
+      cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref, current_date_literal);
+    auto mask = cudf::compute_column(cudf::table_view({l_shipdate_ts->view()}), pred, stream, mr);
+    auto mask_index_type = cudf::cast(mask->view(), cudf::data_type{cudf::type_id::INT8});
+    auto const indices = cudf::test::fixed_width_column_wrapper<int8_t>({0, 1}).release();
+    auto const keys    = cudf::test::strings_column_wrapper({"O", "F"}).release();
+    auto const gather_map = cudf::table_view({indices->view(), keys->view()});
+    auto const gathered_table = cudf::gather(
+      gather_map, mask_index_type->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr);
+    return std::make_tuple(std::move(gathered_table->release()[1]), std::move(mask_index_type));
+  }();
+
+  // Generate the `l_shipinstruct` column
+  auto l_shipinstruct = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_instructions.data(), vocab_instructions.size()),
+    l_num_rows,
+    stream,
+    mr);
+
+  // Generate the `l_shipmode` column
+  auto l_shipmode = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_modes.data(), vocab_modes.size()),
+    l_num_rows,
+    stream,
+    mr);
+
+  // Generate the `l_comment` column
+  // NOTE: This column is not compliant with
+  // clause 4.2.2.10 of the TPC-H specification
+  auto l_comment = generate_random_string_column(10, 43, l_num_rows, stream, mr);
+
+  // Generate the `lineitem_partial` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(l_linestatus_mask));
+  columns.push_back(std::move(l_orderkey));
+  columns.push_back(std::move(l_partkey));
+  columns.push_back(std::move(l_suppkey));
+  columns.push_back(std::move(l_linenumber));
+  columns.push_back(std::move(l_quantity));
+  columns.push_back(std::move(l_discount));
+  columns.push_back(std::move(l_tax));
+  columns.push_back(std::move(l_shipdate_ts));
+  columns.push_back(std::move(l_commitdate_ts));
+  columns.push_back(std::move(l_receiptdate_ts));
+  columns.push_back(std::move(l_returnflag));
+  columns.push_back(std::move(l_linestatus));
+  columns.push_back(std::move(l_shipinstruct));
+  columns.push_back(std::move(l_shipmode));
+  columns.push_back(std::move(l_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
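The `l_returnflag` lambda above turns a two-way predicate into a three-way string column without any branching kernel: a boolean mask is multiplied by an alternating column so that "false" rows map to index 0 and "true" rows alternate between indices 1 and 2, which are then gathered out of a tiny key table. A host-side restatement of just the index arithmetic (illustration only; it assumes the repeat-sequence column produces 1, 2, 1, 2, ...):

    // Host-side sketch of the mask * multiplier -> gather-index trick above.
    #include <cstdint>
    #include <string>
    #include <vector>

    std::vector<std::string> return_flags(std::vector<bool> const& received_by_current_date)
    {
      std::vector<std::string> const keys{"N", "A", "R"};
      std::vector<std::string> out;
      for (std::size_t i = 0; i < received_by_current_date.size(); ++i) {
        std::int8_t const alternating = static_cast<std::int8_t>(i % 2 + 1);    // 1, 2, 1, 2, ...
        std::int8_t const idx = received_by_current_date[i] ? alternating : 0;  // 0 -> "N"
        out.push_back(keys[idx]);
      }
      return out;
    }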
+
+std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem,
+                                                       rmm::cuda_stream_view stream,
+                                                       rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const l_linestatus_mask = lineitem.column(0);
+  auto const l_orderkey        = lineitem.column(1);
+  auto const l_discount        = lineitem.column(6);
+  auto const l_tax             = lineitem.column(7);
+  auto const l_extendedprice   = lineitem.column(16);
+
+  std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns;
+
+  // Generate the `o_totalprice` column
+  // We calculate the `charge` column, which is a function of `l_extendedprice`,
+  // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
+  auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
+  auto o_totalprice = [&]() {
+    auto const keys = cudf::table_view({l_orderkey});
+    cudf::groupby::groupby gb(keys);
+    std::vector<cudf::groupby::aggregation_request> requests;
+    requests.push_back(cudf::groupby::aggregation_request());
+    requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+    requests[0].values = l_charge->view();
+    auto agg_result = gb.aggregate(requests);
+    return cudf::round(agg_result.second[0].results[0]->view(), 2);
+  }();
+  orders_dependent_columns.push_back(std::move(o_totalprice));
+
+  // Generate the `o_orderstatus` column
+  auto o_orderstatus = [&]() {
+    auto const keys = cudf::table_view({l_orderkey});
+    cudf::groupby::groupby gb(keys);
+    std::vector<cudf::groupby::aggregation_request> requests;
+
+    // Perform a `count` aggregation on `l_orderkey`
+    requests.push_back(cudf::groupby::aggregation_request());
+    requests[0].aggregations.push_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
+    requests[0].values = l_orderkey;
+
+    // Perform a `sum` aggregation on `l_linestatus_mask`
+    requests.push_back(cudf::groupby::aggregation_request());
+    requests[1].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+    requests[1].values = l_linestatus_mask;
+
+    // Perform the aggregations
+    auto agg_result = gb.aggregate(requests);
+
+    // Create a `table_view` out of the `l_orderkey`, `count`, and `sum` columns
+    auto const count = std::move(agg_result.second[0].results[0]);
+    auto const sum   = cudf::cast(
+      agg_result.second[1].results[0]->view(), cudf::data_type{cudf::type_id::INT32}, stream, mr);
+
+    auto const table =
+      cudf::table_view({agg_result.first->get_column(0).view(), count->view(), sum->view()});
+
+    // Now on this table,
+    // if `sum` == `count` then "O",
+    // if `sum` == 0, then "F",
+    // else "P"
+
+    // So, we first evaluate an expression `sum == count` and generate a boolean mask
+    auto const count_ref = cudf::ast::column_reference(1);
+    auto const sum_ref   = cudf::ast::column_reference(2);
+    auto const expr_a = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, sum_ref, count_ref);
+    auto const mask_a = cudf::compute_column(table, expr_a);
+    auto const o_orderstatus_intermediate =
+      cudf::copy_if_else(cudf::string_scalar("O"), cudf::string_scalar("F"), mask_a->view());
+
+    // Then, we evaluate an expression `sum == 0` and generate a boolean mask
+    auto zero_scalar        = cudf::numeric_scalar<cudf::size_type>(0);
+    auto const zero_literal = cudf::ast::literal(zero_scalar);
+    auto const expr_b_left =
+      cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, sum_ref, count_ref);
+    auto const expr_b_right =
+      cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, sum_ref, zero_literal);
+    auto const expr_b =
+      cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_b_left, expr_b_right);
+    auto const mask_b = cudf::compute_column(table, expr_b);
+    return cudf::copy_if_else(
+      cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view());
+  }();
+  orders_dependent_columns.push_back(std::move(o_orderstatus));
+  return std::make_unique<cudf::table>(std::move(orders_dependent_columns));
+}
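Restated on the host, the two `copy_if_else` passes above implement one per-order rule (sketch only; `line_count` is the grouped count of lineitem rows and `mask_sum` the grouped sum of `l_linestatus_mask`):

    // Scalar equivalent of the two-pass copy_if_else logic above.
    #include <string>

    std::string order_status(long line_count, long mask_sum)
    {
      if (mask_sum == line_count) { return "O"; }  // pass 1: sum == count -> "O", else "F"
      if (mask_sum == 0) { return "F"; }
      return "P";                                  // pass 2: sum != count && sum != 0 -> "P"
    }

The second mask deliberately excludes both extremes, so "P" overwrites only the rows not already finalized by the first pass.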
+
+/**
+ * @brief Generate the `partsupp` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_partsupp(double scale_factor,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Define the number of rows in the `part` and `partsupp` tables
+  cudf::size_type const p_num_rows  = scale_factor * 200'000;
+  cudf::size_type const ps_num_rows = scale_factor * 800'000;
+
+  // Generate the `ps_partkey` column
+  auto ps_partkey = [&]() {
+    auto const p_partkey = generate_primary_key_column(
+      cudf::numeric_scalar<cudf::size_type>(1), p_num_rows, stream, mr);
+    auto const rep_table = cudf::repeat(cudf::table_view({p_partkey->view()}), 4, stream, mr);
+    return std::move(rep_table->release()[0]);
+  }();
+
+  // Generate the `ps_suppkey` column
+  auto ps_suppkey = calculate_ps_suppkey(ps_partkey->view(), scale_factor, ps_num_rows, stream, mr);
+
+  // Generate the `ps_availqty` column
+  auto ps_availqty =
+    generate_random_numeric_column<cudf::size_type>(1, 9999, ps_num_rows, stream, mr);
+
+  // Generate the `ps_supplycost` column
+  auto ps_supplycost = [&]() {
+    auto const col = generate_random_numeric_column<double>(1.00, 1000.00, ps_num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Generate the `ps_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto ps_comment = generate_random_string_column(49, 198, ps_num_rows, stream, mr);
+
+  // Create the `partsupp` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(ps_partkey));
+  columns.push_back(std::move(ps_suppkey));
+  columns.push_back(std::move(ps_availqty));
+  columns.push_back(std::move(ps_supplycost));
+  columns.push_back(std::move(ps_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `part` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_part(double scale_factor,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  cudf::size_type const num_rows = scale_factor * 200'000;
+
+  // Generate the `p_partkey` column
+  auto p_partkey = generate_primary_key_column(
+    cudf::numeric_scalar<cudf::size_type>(1), num_rows, stream, mr);
+
+  // Generate the `p_name` column
+  auto p_name = [&]() {
+    auto const p_name_a = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_b = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_c = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_d = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_e = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    return cudf::strings::concatenate(
+      cudf::table_view(
+        {p_name_a->view(), p_name_b->view(), p_name_c->view(), p_name_d->view(), p_name_e->view()}),
+      cudf::string_scalar(" "),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `p_mfgr` and `p_brand` columns
+  auto const random_values_m =
+    generate_random_numeric_column<cudf::size_type>(1, 5, num_rows, stream, mr);
+  auto const random_values_m_str =
+    cudf::strings::from_integers(random_values_m->view(), stream, mr);
+
+  auto const random_values_n =
+    generate_random_numeric_column<cudf::size_type>(1, 5, num_rows, stream, mr);
+  auto const random_values_n_str =
+    cudf::strings::from_integers(random_values_n->view(), stream, mr);
+
+  auto p_mfgr = [&]() {
+    auto const mfgr_repeat = generate_repeat_string_column("Manufacturer#", num_rows, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view({mfgr_repeat->view(), random_values_m_str->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  auto p_brand = [&]() {
+    auto const brand_repeat = generate_repeat_string_column("Brand#", num_rows, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view(
+        {brand_repeat->view(), random_values_m_str->view(), random_values_n_str->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `p_type` column
+  auto p_type = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_types.data(), vocab_types.size()),
+    num_rows,
+    stream,
+    mr);
+
+  // Generate the `p_size` column
+  auto p_size = generate_random_numeric_column<cudf::size_type>(1, 50, num_rows, stream, mr);
+
+  // Generate the `p_container` column
+  auto p_container = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_containers.data(), vocab_containers.size()),
+    num_rows,
+    stream,
+    mr);
+
+  // Generate the `p_retailprice` column
+  auto p_retailprice = calculate_p_retailprice(p_partkey->view(), stream, mr);
+
+  // Generate the `p_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto p_comment = generate_random_string_column(5, 22, num_rows, stream, mr);
+
+  // Create the `part` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(p_partkey));
+  columns.push_back(std::move(p_name));
+  columns.push_back(std::move(p_mfgr));
+  columns.push_back(std::move(p_brand));
+  columns.push_back(std::move(p_type));
+  columns.push_back(std::move(p_size));
+  columns.push_back(std::move(p_container));
+  columns.push_back(std::move(p_retailprice));
+  columns.push_back(std::move(p_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `orders`, `lineitem`, and `part` tables
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
+generate_orders_lineitem_part(double scale_factor,
+                              rmm::cuda_stream_view stream,
+                              rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Generate a table with the independent columns of the `orders` table
+  auto orders_independent = generate_orders_independent(scale_factor, stream, mr);
+
+  // Generate the `lineitem` table partially
+  auto lineitem_partial =
+    generate_lineitem_partial(orders_independent->view(), scale_factor, stream, mr);
+
+  // Generate the `part` table
+  auto part = generate_part(scale_factor, stream, mr);
+
+  // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column,
+  // add the column to the `lineitem` table, and write the `lineitem` table to a parquet file
+
+  auto l_extendedprice = [&]() {
+    auto const left = cudf::table_view(
+      {lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()});
+    auto const right = cudf::table_view({part->get_column(0).view(), part->get_column(7).view()});
+    auto const joined_table = perform_left_join(left, right, {0}, {0}, stream, mr);
+    auto joined_table_columns = joined_table->release();
+    auto const l_quantity = std::move(joined_table_columns[1]);
+    auto const l_quantity_fp =
+      cudf::cast(l_quantity->view(), cudf::data_type{cudf::type_id::FLOAT64});
+    auto const p_retailprice = std::move(joined_table_columns[3]);
+    auto const col = cudf::binary_operation(l_quantity_fp->view(),
+                                            p_retailprice->view(),
+                                            cudf::binary_operator::MUL,
+                                            cudf::data_type{cudf::type_id::FLOAT64},
+                                            stream,
+                                            mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  auto lineitem_partial_columns = lineitem_partial->release();
+  lineitem_partial_columns.push_back(std::move(l_extendedprice));
+  auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns));
+
+  // Generate the dependent columns of the `orders` table
+  // and merge them with the independent columns
+  auto orders_dependent = generate_orders_dependent(lineitem_temp->view(), stream, mr);
+
+  auto orders_independent_columns = orders_independent->release();
+  auto orders_dependent_columns   = orders_dependent->release();
+  orders_independent_columns.insert(orders_independent_columns.end(),
+                                    std::make_move_iterator(orders_dependent_columns.begin()),
+                                    std::make_move_iterator(orders_dependent_columns.end()));
+
+  // Create the `orders` table
+  auto orders = std::make_unique<cudf::table>(std::move(orders_independent_columns));
+
+  // Create the `lineitem` table
+  auto lineitem_temp_columns = lineitem_temp->release();
+  lineitem_temp_columns.erase(lineitem_temp_columns.begin());
+  auto lineitem = std::make_unique<cudf::table>(std::move(lineitem_temp_columns));
+
+  return std::make_tuple(std::move(orders), std::move(lineitem), std::move(part));
+}
+
+/**
+ * @brief Generate the `supplier` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_supplier(double scale_factor,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Calculate the number of rows based on the scale factor
+  cudf::size_type const num_rows = scale_factor * 10'000;
+
+  // Generate the `s_suppkey` column
+  auto s_suppkey = generate_primary_key_column(
+    cudf::numeric_scalar<cudf::size_type>(1), num_rows, stream, mr);
+
+  // Generate the `s_name` column
+  auto s_name = [&]() {
+    auto const supplier_repeat = generate_repeat_string_column("Supplier#", num_rows, stream, mr);
+    auto const s_suppkey_str = cudf::strings::from_integers(s_suppkey->view(), stream, mr);
+    auto const s_suppkey_str_padded = cudf::strings::zfill(s_suppkey_str->view(), 9, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view({supplier_repeat->view(), s_suppkey_str_padded->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `s_address` column
+  auto s_address = generate_address_column(num_rows, stream, mr);
+
+  // Generate the `s_nationkey` column
+  auto s_nationkey = generate_random_numeric_column<cudf::size_type>(0, 24, num_rows, stream, mr);
+
+  // Generate the `s_phone` column
+  auto s_phone = generate_phone_column(num_rows, stream, mr);
+
+  // Generate the `s_acctbal` column
+  auto s_acctbal = [&]() {
+    auto const col = generate_random_numeric_column<double>(-999.99, 9999.99, num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Generate the `s_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto s_comment = generate_random_string_column(25, 100, num_rows, stream, mr);
+
+  // Create the `supplier` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(s_suppkey));
+  columns.push_back(std::move(s_name));
+  columns.push_back(std::move(s_address));
+  columns.push_back(std::move(s_nationkey));
+  columns.push_back(std::move(s_phone));
+  columns.push_back(std::move(s_acctbal));
+  columns.push_back(std::move(s_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `customer` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_customer(double scale_factor,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Calculate the number of rows based on the scale factor
+  cudf::size_type const num_rows = scale_factor * 150'000;
+
+  // Generate the `c_custkey` column
+  auto c_custkey = generate_primary_key_column(
+    cudf::numeric_scalar<cudf::size_type>(1), num_rows, stream, mr);
+
+  // Generate the `c_name` column
+  auto c_name = [&]() {
+    auto const customer_repeat = generate_repeat_string_column("Customer#", num_rows, stream, mr);
+    auto const c_custkey_str = cudf::strings::from_integers(c_custkey->view(), stream, mr);
+    auto const c_custkey_str_padded = cudf::strings::zfill(c_custkey_str->view(), 9, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view({customer_repeat->view(), c_custkey_str_padded->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `c_address` column
+  auto c_address = generate_address_column(num_rows, stream, mr);
+
+  // Generate the `c_nationkey` column
+  auto c_nationkey = generate_random_numeric_column<cudf::size_type>(0, 24, num_rows, stream, mr);
+
+  // Generate the `c_phone` column
+  auto c_phone = generate_phone_column(num_rows, stream, mr);
+
+  // Generate the `c_acctbal` column
+  auto c_acctbal = [&]() {
+    auto const col = generate_random_numeric_column<double>(-999.99, 9999.99, num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Generate the `c_mktsegment` column
+  auto c_mktsegment = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_segments.data(), vocab_segments.size()),
+    num_rows,
+    stream,
+    mr);
+
+  // Generate the `c_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto c_comment = generate_random_string_column(29, 116, num_rows, stream, mr);
+
+  // Create the `customer` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(c_custkey));
+  columns.push_back(std::move(c_name));
+  columns.push_back(std::move(c_address));
+  columns.push_back(std::move(c_nationkey));
+  columns.push_back(std::move(c_phone));
+  columns.push_back(std::move(c_acctbal));
+  columns.push_back(std::move(c_mktsegment));
+  columns.push_back(std::move(c_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
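The `generate_primary_key_column` helper used throughout these functions is defined elsewhere in this benchmark utility. One plausible implementation (an assumption, not the patch's actual code; `primary_key_sketch` is a hypothetical name) is a plain sequence starting at the given scalar:

    // Sketch only: a primary key column as a simple sequence 1, 2, ..., num_rows.
    #include <cudf/column/column.hpp>
    #include <cudf/filling.hpp>
    #include <cudf/scalar/scalar.hpp>
    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/resource_ref.hpp>

    std::unique_ptr<cudf::column> primary_key_sketch(cudf::size_type num_rows,
                                                     rmm::cuda_stream_view stream,
                                                     rmm::device_async_resource_ref mr)
    {
      auto const init = cudf::numeric_scalar<cudf::size_type>(1);
      return cudf::sequence(num_rows, init, stream, mr);  // 1, 2, 3, ..., num_rows
    }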
+
+/**
+ * @brief Generate the `nation` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_nation(rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Define the number of rows
+  constexpr cudf::size_type num_rows = 25;
+
+  // Generate the `n_nationkey` column
+  auto n_nationkey = generate_primary_key_column(
+    cudf::numeric_scalar<cudf::size_type>(0), num_rows, stream, mr);
+
+  // Generate the `n_name` column
+  auto n_name = cudf::test::strings_column_wrapper(nations.begin(), nations.end()).release();
+
+  // Generate the `n_regionkey` column
+  std::vector<int8_t> region_keys{0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2,
+                                  4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1};
+  auto n_regionkey =
+    cudf::test::fixed_width_column_wrapper<int8_t>(region_keys.begin(), region_keys.end())
+      .release();
+
+  // Generate the `n_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto n_comment = generate_random_string_column(31, 114, num_rows, stream, mr);
+
+  // Create the `nation` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(n_nationkey));
+  columns.push_back(std::move(n_name));
+  columns.push_back(std::move(n_regionkey));
+  columns.push_back(std::move(n_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `region` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_region(rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Define the number of rows
+  constexpr cudf::size_type num_rows = 5;
+
+  // Generate the `r_regionkey` column
+  auto r_regionkey = generate_primary_key_column(
+    cudf::numeric_scalar<cudf::size_type>(0), num_rows, stream, mr);
+
+  // Generate the `r_name` column
+  auto r_name =
+    cudf::test::strings_column_wrapper({"AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST"})
+      .release();
+
+  // Generate the `r_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto r_comment = generate_random_string_column(31, 115, num_rows, stream, mr);
+
+  // Create the `region` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(r_regionkey));
+  columns.push_back(std::move(r_name));
+  columns.push_back(std::move(r_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+}  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
new file mode 100644
index 00000000000..a6286dd8dba
--- /dev/null
+++ b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/table/table.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace datagen {
+
+/**
+ * @brief Generate the `orders`, `lineitem`, and `part` tables
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
+generate_orders_lineitem_part(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `partsupp` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_partsupp(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `supplier` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_supplier(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `customer` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_customer(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `nation` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_nation(
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `region` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_region(
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+}  // namespace datagen
+}  // namespace CUDF_EXPORT cudf
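For reference, a minimal driver for the API declared above (a sketch only, assuming it is compiled into the benchmarks target that provides this header; the small scale factor is just for a quick smoke test):

    // Generate every TPC-H table at SF 0.01 using the declarations above.
    #include "tpch_data_generator.hpp"

    #include <cudf/table/table.hpp>

    int main()
    {
      double const scale_factor = 0.01;
      auto [orders, lineitem, part] =
        cudf::datagen::generate_orders_lineitem_part(scale_factor);
      auto partsupp = cudf::datagen::generate_partsupp(scale_factor);
      auto supplier = cudf::datagen::generate_supplier(scale_factor);
      auto customer = cudf::datagen::generate_customer(scale_factor);
      auto nation   = cudf::datagen::generate_nation();
      auto region   = cudf::datagen::generate_region();
      return orders->num_rows() > 0 ? 0 : 1;
    }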
From 5a81a80cef59649f059d55004f745001a59b3f6f Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 30 Aug 2024 11:57:53 -0400
Subject: [PATCH 154/270] [BUG] Add gpu node type to cudf-pandas 3rd-party
 integration nightly CI job (#16704)

Following up on #16645, this adds a GPU node type to the nightly CI job.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16704
---
 .github/workflows/test.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2c68f2861bb..8605fa46f68 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -132,6 +132,7 @@ jobs:
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
+      node_type: "gpu-v100-latest-1"
       container_image: "rapidsai/ci-conda:latest"
       run_script: |
         ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml

From 2d6758f39592e6296a042eb8e771171c50899013 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 30 Aug 2024 13:22:58 -0700
Subject: [PATCH 155/270] Enable batched multi-source reading of JSONL files
 with large records (#16687)

Addresses #16664

Implements reallocate-and-retry logic when the initial buffer size estimate
fails for byte range reading. The chunked reader test checks for correct
reallocation at different chunk sizes.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16687
---
 cpp/src/io/json/read_json.cu    | 48 ++++++++++++++++++++++++---------
 cpp/tests/io/json/json_test.cpp | 47 ++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 2658cbbed2f..98e8e8d3c7e 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -138,14 +138,14 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
   chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
 
-  int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced;
+  int num_subchunks_prealloced       = should_load_all_sources ? 0 : max_subchunks_prealloced;
   std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
 
   // The allocation for single source compressed input is estimated by assuming a ~4:1
   // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea
   // of subchunks.
   auto constexpr header_size = 4096;
-  std::size_t const buffer_size =
+  std::size_t buffer_size =
     reader_compression != compression_type::NONE
       ? total_source_size * estimated_compression_ratio + header_size
       : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
          num_extra_delimiters;
@@ -169,18 +169,40 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   // Find next delimiter
   std::int64_t next_delim_pos     = -1;
   std::size_t next_subchunk_start = chunk_offset + chunk_size;
-  while (next_subchunk_start < total_source_size && next_delim_pos < buffer_offset) {
-    buffer_offset += readbufspan.size();
-    readbufspan = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
-                                   sources,
-                                   reader_compression,
-                                   next_subchunk_start,
-                                   size_per_subchunk,
-                                   stream);
-    next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
-    if (next_delim_pos < buffer_offset) { next_subchunk_start += size_per_subchunk; }
+  while (next_delim_pos < buffer_offset) {
+    for (int subchunk = 0;
+         subchunk < num_subchunks_prealloced && next_delim_pos < buffer_offset &&
+         next_subchunk_start < total_source_size;
+         subchunk++) {
+      buffer_offset += readbufspan.size();
+      readbufspan = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
+                                     sources,
+                                     reader_compression,
+                                     next_subchunk_start,
+                                     size_per_subchunk,
+                                     stream);
+      next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
+      next_subchunk_start += size_per_subchunk;
+    }
+    if (next_delim_pos < buffer_offset) {
+      if (next_subchunk_start >= total_source_size) {
+        // If we have reached the end of source list but the source does not terminate with a
+        // newline character
+        next_delim_pos = buffer_offset + readbufspan.size();
+      } else {
+        // Our buffer_size estimate is insufficient to read until the end of the line! We need to
+        // allocate more memory and try again!
+        num_subchunks_prealloced *= 2;
+        buffer_size = reader_compression != compression_type::NONE
                        ? 2 * buffer_size
                        : std::min(total_source_size,
                                   buffer_size + num_subchunks_prealloced * size_per_subchunk) +
                           num_extra_delimiters;
+        buffer.resize(buffer_size, stream);
+        bufspan = device_span<char>(reinterpret_cast<char*>(buffer.data()), buffer.size());
+      }
+    }
   }
-  if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size();
 
   return datasource::owning_buffer<rmm::device_buffer>(
     std::move(buffer),
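Stripped of the cudf specifics, the fix above is a classic grow-and-retry loop: read in subchunks until a record delimiter is found, and if the capacity estimate runs out first, double it and keep going. A self-contained host-side sketch of the same shape (`find_record_end` and its arguments are illustrative, not from the patch):

    // Host-side analogue of the reallocate-and-retry loop above.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <string>

    std::size_t find_record_end(std::string const& source, std::size_t chunk, std::size_t subchunk)
    {
      std::string buffer;
      std::size_t capacity = chunk + 2 * subchunk;  // initial estimate, like buffer_size
      std::size_t offset   = 0;
      while (true) {
        // Read subchunks until the current capacity is exhausted.
        while (offset < capacity && offset < source.size()) {
          std::size_t const n = std::min(subchunk, std::min(capacity, source.size()) - offset);
          buffer.append(source, offset, n);
          offset += n;
          if (auto pos = buffer.find('\n'); pos != std::string::npos) { return pos; }
        }
        if (offset >= source.size()) { return buffer.size(); }  // unterminated last record
        capacity *= 2;  // estimate was too small: grow and retry
      }
    }

    int main()
    {
      std::string const jsonl = std::string(100, 'x') + "\nnext record";
      std::cout << find_record_end(jsonl, 8, 4) << '\n';  // prints 100
    }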
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 576a698ba31..c26e5ca3edb 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -680,6 +680,53 @@ TEST_F(JsonReaderTest, JsonLinesByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{3000, 4000, 5000}});
 }
 
+TEST_F(JsonReaderTest, JsonLinesByteRangeWithRealloc)
+{
+  std::string long_string     = "haha";
+  std::size_t log_repetitions = 12;
+  long_string.reserve(long_string.size() * (1UL << log_repetitions));
+  for (std::size_t i = 0; i < log_repetitions; i++) {
+    long_string += long_string;
+  }
+
+  auto json_string = [&long_string]() {
+    std::string json_string = R"(
+    { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+    std::string replace_chars = "c";
+    std::size_t pos           = json_string.find(replace_chars);
+    while (pos != std::string::npos) {
+      // Replace the substring with the specified string
+      json_string.replace(pos, replace_chars.size(), long_string);
+
+      // Find the next occurrence of the substring
+      pos = json_string.find(replace_chars, pos + long_string.size());
+    }
+    return json_string;
+  }();
+
+  // Initialize parsing options (reading json lines). Set byte range offset and size so as to read
+  // the second row of input
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{cudf::host_span<std::byte>(
+        reinterpret_cast<std::byte*>(json_string.data()), json_string.size())})
+      .lines(true)
+      .compression(cudf::io::compression_type::NONE)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL)
+      .byte_range_offset(16430)
+      .byte_range_size(30);
+
+  // Read full test data via existing, nested JSON lines reader
+  cudf::io::table_with_metadata result = cudf::io::read_json(json_lines_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 3);
+  EXPECT_EQ(result.tbl->num_rows(), 1);
+  EXPECT_EQ(result.metadata.schema_info[2].name, long_string);
+}
+
 TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_AcrossFiles)
 {
   const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";

From c6c720f48815ec93a543cb42fbb128d3c0eb983e Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Fri, 30 Aug 2024 16:47:26 -0400
Subject: [PATCH 156/270] Implement exposed null mask APIs in pylibcudf (#15908)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15908
---
 docs/cudf/source/conf.py                      |   2 +
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 .../api_docs/pylibcudf/null_mask.rst          |   6 +
 python/cudf/cudf/_lib/null_mask.pyx           | 103 +++----------
 python/pylibcudf/pylibcudf/CMakeLists.txt     |   1 +
 python/pylibcudf/pylibcudf/__init__.pxd       |   2 +
 python/pylibcudf/pylibcudf/__init__.py        |   2 +
.../pylibcudf/pylibcudf/libcudf/null_mask.pxd | 2 - python/pylibcudf/pylibcudf/null_mask.pxd | 18 +++ python/pylibcudf/pylibcudf/null_mask.pyx | 142 ++++++++++++++++++ .../pylibcudf/tests/test_null_mask.py | 59 ++++++++ 11 files changed, 252 insertions(+), 86 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst create mode 100644 python/pylibcudf/pylibcudf/null_mask.pxd create mode 100644 python/pylibcudf/pylibcudf/null_mask.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_null_mask.py diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 43e2d6031bc..c58bc42327c 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,6 +342,7 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), + "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), } @@ -383,6 +384,7 @@ def _generate_namespaces(namespaces): # Cython types that don't alias cleanly because of # https://github.com/cython/cython/issues/5609 "size_type", + "size_t", "type_id", # Unknown base types "int32_t", diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 505765bba0f..6a2b66e8ea0 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -23,6 +23,7 @@ This page provides API documentation for pylibcudf. join lists merge + null_mask quantiles reduce replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst new file mode 100644 index 00000000000..4799c62eace --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst @@ -0,0 +1,6 @@ +========= +null_mask +========= + +.. automodule:: pylibcudf.null_mask + :members: diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 3a7b6a59bf3..d54e8e66281 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -1,39 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from enum import Enum
-
-from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+import pylibcudf
+from pylibcudf.null_mask import MaskState
 
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 
-from libcpp.memory cimport make_unique, unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.null_mask cimport (
-    bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes,
-    bitmask_and as cpp_bitmask_and,
-    bitmask_or as cpp_bitmask_or,
-    copy_bitmask as cpp_copy_bitmask,
-    create_null_mask as cpp_create_null_mask,
-    underlying_type_t_mask_state,
-)
-from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport mask_state, size_type
-
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport table_view_from_columns
-
-
-class MaskState(Enum):
-    """
-    Enum for null mask creation state
-    """
-    UNALLOCATED = mask_state.UNALLOCATED
-    UNINITIALIZED = mask_state.UNINITIALIZED
-    ALL_VALID = mask_state.ALL_VALID
-    ALL_NULL = mask_state.ALL_NULL
 
 
 @acquire_spill_lock()
@@ -45,33 +17,20 @@ def copy_bitmask(Column col):
     if col.base_mask is None:
         return None
 
-    cdef column_view col_view = col.view()
-    cdef device_buffer db
-    cdef unique_ptr[device_buffer] up_db
-
-    with nogil:
-        db = move(cpp_copy_bitmask(col_view))
-        up_db = move(make_unique[device_buffer](move(db)))
-
-    rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
+    rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read"))
    buf = as_buffer(rmm_db)
    return buf
 
 
-def bitmask_allocation_size_bytes(size_type num_bits):
+def bitmask_allocation_size_bytes(num_bits):
     """
     Given a size, calculates the number of bytes that should be allocated for a
     column validity mask
     """
-    cdef size_t output_size
-
-    with nogil:
-        output_size = cpp_bitmask_allocation_size_bytes(num_bits)
-
-    return output_size
+    return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits)
 
-
-def create_null_mask(size_type size, state=MaskState.UNINITIALIZED):
+def create_null_mask(size, state=MaskState.UNINITIALIZED):
     """
     Given a size and a mask state, allocate a mask that can properly represent
     the given size with the given mask state
@@ -83,48 +42,24 @@ def create_null_mask(size_type size, state=MaskState.UNINITIALIZED):
     state : ``MaskState``, default ``MaskState.UNINITIALIZED``
         State the null mask should be created in
     """
-    if not isinstance(state, MaskState):
-        raise TypeError(
-            "`state` is required to be of type `MaskState`, got "
-            + (type(state).__name__)
-        )
-
-    cdef device_buffer db
-    cdef unique_ptr[device_buffer] up_db
-    cdef mask_state c_mask_state = <mask_state>(
-        <underlying_type_t_mask_state>(state.value)
-    )
-
-    with nogil:
-        db = move(cpp_create_null_mask(size, c_mask_state))
-        up_db = move(make_unique[device_buffer](move(db)))
-
-    rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
+    rmm_db = pylibcudf.null_mask.create_null_mask(size, state)
     buf = as_buffer(rmm_db)
     return buf
 
 
 @acquire_spill_lock()
-def bitmask_and(columns: list):
-    cdef table_view c_view = table_view_from_columns(columns)
-    cdef pair[device_buffer, size_type] c_result
-    cdef unique_ptr[device_buffer] up_db
-    with nogil:
-        c_result = move(cpp_bitmask_and(c_view))
-        up_db = move(make_unique[device_buffer](move(c_result.first)))
-    dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db))
-    buf = as_buffer(dbuf)
-    return buf, c_result.second
+def bitmask_and(list columns):
+    rmm_db, other = pylibcudf.null_mask.bitmask_and(
+        [col.to_pylibcudf(mode="read") for col in columns]
+    )
+    buf = as_buffer(rmm_db)
+    return buf, other
 
 
 @acquire_spill_lock()
-def bitmask_or(columns: list):
-    cdef table_view c_view = table_view_from_columns(columns)
-    cdef pair[device_buffer, size_type] c_result
-    cdef unique_ptr[device_buffer] up_db
-    with nogil:
-        c_result = move(cpp_bitmask_or(c_view))
-        up_db = move(make_unique[device_buffer](move(c_result.first)))
-    dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db))
-    buf = as_buffer(dbuf)
-    return buf, c_result.second
+def bitmask_or(list columns):
+    rmm_db, other = pylibcudf.null_mask.bitmask_or(
+        [col.to_pylibcudf(mode="read") for col in columns]
+    )
+    buf = as_buffer(rmm_db)
+    return buf, other
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index f81a32e07f9..a4f17344cb0 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -29,6 +29,7 @@ set(cython_sources
     join.pyx
     lists.pyx
     merge.pyx
+    null_mask.pyx
     quantiles.pyx
     reduce.pyx
     replace.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index 71f523fc3cd..841efa59bda 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -15,6 +15,7 @@ from . cimport (
     join,
     lists,
     merge,
+    null_mask,
     quantiles,
     reduce,
     replace,
@@ -57,6 +58,7 @@ __all__ = [
     "join",
     "lists",
     "merge",
+    "null_mask",
     "quantiles",
     "reduce",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index e784c6c6dd5..d3878a89a6a 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -26,6 +26,7 @@
     join,
     lists,
     merge,
+    null_mask,
     quantiles,
     reduce,
     replace,
@@ -69,6 +70,7 @@
     "join",
     "lists",
     "merge",
+    "null_mask",
     "quantiles",
     "reduce",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
index 3fc2c7e8f1e..5f582091b06 100644
--- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
@@ -8,8 +8,6 @@ from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type
 
 from rmm._lib.device_buffer cimport device_buffer
 
-ctypedef int32_t underlying_type_t_mask_state
-
 cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil:
     cdef device_buffer copy_bitmask "cudf::copy_bitmask" (
diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd
new file mode 100644
index 00000000000..ab5c0080312
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/null_mask.pxd
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.libcudf.types cimport mask_state, size_type
+
+from rmm._lib.device_buffer cimport DeviceBuffer
+
+from .column cimport Column
+
+
+cpdef DeviceBuffer copy_bitmask(Column col)
+
+cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits)
+
+cpdef DeviceBuffer create_null_mask(size_type size, mask_state state = *)
+
+cpdef tuple bitmask_and(list columns)
+
+cpdef tuple bitmask_or(list columns)
diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx
new file mode 100644
index 00000000000..5bdde06f21f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/null_mask.pyx
@@ -0,0 +1,142 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from libcpp.memory cimport make_unique +from libcpp.pair cimport pair +from libcpp.utility cimport move +from pylibcudf.libcudf cimport null_mask as cpp_null_mask +from pylibcudf.libcudf.types cimport mask_state, size_type + +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint + +from .column cimport Column +from .table cimport Table + + +cdef DeviceBuffer buffer_to_python(device_buffer buf): + return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) + + +cpdef DeviceBuffer copy_bitmask(Column col): + """Copies ``col``'s bitmask into a ``DeviceBuffer``. + + For details, see :cpp:func:`copy_bitmask`. + + Parameters + ---------- + col : Column + Column whose bitmask needs to be copied + + Returns + ------- + rmm.DeviceBuffer + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` + if ``col`` is not nullable + """ + cdef device_buffer db + + with nogil: + db = move(cpp_null_mask.copy_bitmask(col.view())) + + return buffer_to_python(move(db)) + +cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits): + """ + Computes the required bytes necessary to represent the specified number of bits + with a 64B padding boundary. + + For details, see :cpp:func:`bitmask_allocation_size_bytes`. + + Parameters + ---------- + number_of_bits : size_type + The number of bits that need to be represented + + Returns + ------- + size_t + The necessary number of bytes + """ + with nogil: + return cpp_null_mask.bitmask_allocation_size_bytes(number_of_bits) + + +cpdef DeviceBuffer create_null_mask( + size_type size, + mask_state state = mask_state.UNINITIALIZED +): + """Creates a ``DeviceBuffer`` for use as a null value indicator bitmask of a + ``Column``. + + For details, see :cpp:func:`create_null_mask`. + + Parameters + ---------- + size : size_type + The number of elements to be represented by the mask + state : mask_state, optional + The desired state of the mask. Can be one of { MaskState.UNALLOCATED, + MaskState.UNINITIALIZED, MaskState.ALL_VALID, MaskState.ALL_NULL } + (default MaskState.UNINITIALIZED) + + Returns + ------- + rmm.DeviceBuffer + A ``DeviceBuffer`` for use as a null bitmask satisfying the desired size and + state + """ + cdef device_buffer db + + with nogil: + db = move(cpp_null_mask.create_null_mask(size, state)) + + return buffer_to_python(move(db)) + + +cpdef tuple bitmask_and(list columns): + """Performs bitwise AND of the bitmasks of a list of columns. + + For details, see :cpp:func:`bitmask_and`. + + Parameters + ---------- + columns : list + The list of columns + + Returns + ------- + tuple[DeviceBuffer, size_type] + A tuple of the resulting mask and count of unset bits + """ + cdef Table c_table = Table(columns) + cdef pair[device_buffer, size_type] c_result + + with nogil: + c_result = move(cpp_null_mask.bitmask_and(c_table.view())) + + return buffer_to_python(move(c_result.first)), c_result.second + + +cpdef tuple bitmask_or(list columns): + """Performs bitwise OR of the bitmasks of a list of columns. + + For details, see :cpp:func:`bitmask_or`. 
+
+    Parameters
+    ----------
+    columns : list
+        The list of columns
+
+    Returns
+    -------
+    tuple[DeviceBuffer, size_type]
+        A tuple of the resulting mask and count of unset bits
+    """
+    cdef Table c_table = Table(columns)
+    cdef pair[device_buffer, size_type] c_result
+
+    with nogil:
+        c_result = move(cpp_null_mask.bitmask_or(c_table.view()))
+
+    return buffer_to_python(move(c_result.first)), c_result.second
diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
new file mode 100644
index 00000000000..3edcae59edc
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from pylibcudf.null_mask import MaskState
+
+import rmm
+
+
+@pytest.fixture(params=[False, True])
+def nullable(request):
+    return request.param
+
+
+@pytest.fixture(params=["float32", "float64"])
+def column(request, nullable):
+    values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5]
+    typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param]
+    if nullable:
+        values[2] = None
+    return plc.interop.from_arrow(pa.array(values, type=typ))
+
+
+def test_copy_bitmask(column, nullable):
+    expected = column.null_mask().obj if nullable else rmm.DeviceBuffer()
+    got = plc.null_mask.copy_bitmask(column)
+
+    assert expected.size == got.size
+    assert expected.tobytes() == got.tobytes()
+
+
+def test_bitmask_allocation_size_bytes():
+    assert plc.null_mask.bitmask_allocation_size_bytes(0) == 0
+    assert plc.null_mask.bitmask_allocation_size_bytes(1) == 64
+    assert plc.null_mask.bitmask_allocation_size_bytes(512) == 64
+    assert plc.null_mask.bitmask_allocation_size_bytes(513) == 128
+    assert plc.null_mask.bitmask_allocation_size_bytes(1024) == 128
+    assert plc.null_mask.bitmask_allocation_size_bytes(1025) == 192
+
+
+@pytest.mark.parametrize("size", [0, 1, 512, 1024])
+@pytest.mark.parametrize(
+    "state",
+    [
+        MaskState.UNALLOCATED,
+        MaskState.UNINITIALIZED,
+        MaskState.ALL_VALID,
+        MaskState.ALL_NULL,
+    ],
+)
+def test_create_null_mask(size, state):
+    mask = plc.null_mask.create_null_mask(size, state)
+
+    assert mask.size == (
+        0
+        if state == MaskState.UNALLOCATED
+        else plc.null_mask.bitmask_allocation_size_bytes(size)
+    )
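For reference, a small end-to-end use of the new bindings, based on the APIs exercised in the tests above (the literal values in the comments are assumptions):

    # Usage sketch for the pylibcudf null_mask module added by this patch.
    import pyarrow as pa
    import pylibcudf as plc
    from pylibcudf.null_mask import MaskState

    col = plc.interop.from_arrow(pa.array([1, None, 3], type=pa.int32()))
    mask = plc.null_mask.copy_bitmask(col)                 # rmm.DeviceBuffer copy of col's bitmask
    size = plc.null_mask.bitmask_allocation_size_bytes(3)  # padded to a 64B boundary -> 64
    fresh = plc.null_mask.create_null_mask(3, MaskState.ALL_VALID)
    anded, unset_count = plc.null_mask.bitmask_and([col, col])
    print(size, unset_count)                               # expected: 64 1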
From 5e420ff63ba2997a37bf5dfbfaa73c5f05225f9d Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 30 Aug 2024 17:44:03 -0400
Subject: [PATCH 157/270] Use merge base when calculating changed files (#16709)

`get-pr-info.outputs.base.sha` does not actually give the merge base, but
merely the tip of the target branch. Calculate the merge base and pass it
to the `changed-files` step.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16709
---
 .github/workflows/pr.yaml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 35c7e3d95b6..0d79568f589 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -56,14 +56,21 @@ jobs:
       - name: Checkout code repo
         uses: actions/checkout@v4
         with:
-          ref: ${{ inputs.sha }}
-          fetch-depth: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).commits }}
+          fetch-depth: 0
           persist-credentials: false
+      - name: Calculate merge base
+        id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") > "$GITHUB_OUTPUT"
       - name: Get changed files
         id: changed-files
         uses: tj-actions/changed-files@v45
         with:
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files_yaml: |
             cpp:
               - '**'

From 4ad4b2347160212b10f394719f575c6e477f129e Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 31 Aug 2024 11:39:01 -0500
Subject: [PATCH 158/270] remove some unnecessary libcudf nightly builds (#16714)

Follow-up to #16650 and #15483.

`libcudf` wheels are identical (same content, same filename) across Python
versions, but due to an oversight in the PRs linked above, we're currently
building nightlies of them once per Python version supported by RAPIDS 😭

You can see this on recent runs of the `build` workflow
([build link](https://github.com/rapidsai/cudf/actions/runs/10627299703/job/29460218854)).

This PR fixes that by applying the same matrix filter to `libcudf` nightly
build jobs as is currently applied to PR jobs
(https://github.com/rapidsai/cudf/blob/5e420ff63ba2997a37bf5dfbfaa73c5f05225f9d/.github/workflows/pr.yaml#L195-L200).

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16714
---
 .github/workflows/build.yaml | 2 ++
 .github/workflows/pr.yaml    | 1 +
 2 files changed, 3 insertions(+)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 72daff7b66b..b5d17022a3a 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -71,6 +71,8 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 0d79568f589..8730804e8b6 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -197,6 +197,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
+      # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: pull-request
       script: "ci/build_wheel_libcudf.sh"

From 76059580abb7a60128545d6ed977c942ea39b3be Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Sun, 1 Sep 2024 11:56:27 -0500
Subject: [PATCH 159/270] Remove java ColumnView.copyWithBooleanColumnAsValidity
 (#16660)

This depends on https://github.com/NVIDIA/spark-rapids/pull/11399

Essentially, `ifElse` is faster than this API, and this API is not safe to
use generically
(https://github.com/NVIDIA/spark-rapids/issues/11397#issuecomment-2310570124).

So I am removing it after replacing all calls to it with calls to
`ifElse`/`cudf::copy_if_else`.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16660
---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 38 -----------------
 java/src/main/native/src/ColumnViewJni.cpp    | 15 -------
 java/src/main/native/src/ColumnViewJni.cu    | 31 --------------
 java/src/main/native/src/ColumnViewJni.hpp    | 16 -------
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 42 +------------------
 5 files changed, 1 insertion(+), 141 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 8ff2f0f0a73..6bd4e06c47e 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -913,25 +913,6 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co
     return new ColumnVector(bitwiseMergeAndSetValidity(getNativeView(), columnViews, mergeOp.nativeId));
   }
 
-  /**
-   * Creates a deep copy of a column while replacing the validity mask. The validity mask is the
-   * device_vector equivalent of the boolean column given as argument.
-   *
-   * The boolColumn must have the same number of rows as the current column.
-   * The result column will have the same number of rows as the current column.
-   * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i.
-   * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   *
-   * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
-   * then the row value is undefined.
-   *
-   * @param boolColumn bool column whose value is to be used as the validity mask.
-   * @return Deep copy of the column with replaced validity mask.
-   */
-  public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) {
-    return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView()));
-  }
-
   /////////////////////////////////////////////////////////////////////////////
   // DATE/TIME
   /////////////////////////////////////////////////////////////////////////////
@@ -4767,25 +4748,6 @@ private static native long clamper(long nativeView, long loScalarHandle, long lo
   private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] viewHandles,
                                                         int nullConfig) throws CudfException;
 
-  /**
- * The result column will have the same number of rows as the exemplar. - * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i. - * For all other values (i.e. `false` or `null`), the result column will have nulls. - * - * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, - * then the resultant row value is undefined. - * - * @param exemplarViewHandle column view of the column that is deep copied. - * @param boolColumnViewHandle bool column whose value is to be used as the null mask. - * @return Deep copy of the column with replaced null mask. - */ - private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, - long boolColumnViewHandle) throws CudfException; - //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the OffHeap inner class. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4551325ebb1..72f0ad19912 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2090,21 +2090,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithBooleanColumnAsValidity( - JNIEnv* env, jobject j_object, jlong exemplar_handle, jlong validity_column_handle) -{ - JNI_NULL_CHECK(env, exemplar_handle, "ColumnView handle is null", 0); - JNI_NULL_CHECK(env, validity_column_handle, "Validity column handle is null", 0); - try { - cudf::jni::auto_set_device(env); - auto const exemplar = *reinterpret_cast(exemplar_handle); - auto const validity = *reinterpret_cast(validity_column_handle); - return release_as_jlong( - cudf::jni::new_column_with_boolean_column_as_validity(exemplar, validity)); - } - CATCH_STD(env, 0); -} - //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the CudfColumn inner class. 
diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 2dbff923544..46261b087ae 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -43,37 +43,6 @@ namespace cudf::jni { -std::unique_ptr new_column_with_boolean_column_as_validity( - cudf::column_view const& exemplar, cudf::column_view const& validity_column) -{ - CUDF_EXPECTS(validity_column.type().id() == type_id::BOOL8, - "Validity column must be of type bool"); - CUDF_EXPECTS(validity_column.size() == exemplar.size(), - "Exemplar and validity columns must have the same size"); - - auto validity_device_view = cudf::column_device_view::create(validity_column); - auto validity_begin = cudf::detail::make_optional_iterator( - *validity_device_view, cudf::nullate::DYNAMIC{validity_column.has_nulls()}); - auto validity_end = validity_begin + validity_device_view->size(); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity_begin, - validity_end, - [] __device__(auto optional_bool) { return optional_bool.value_or(false); }, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - auto const exemplar_without_null_mask = - cudf::column_view{exemplar.type(), - exemplar.size(), - exemplar.head(), - nullptr, - 0, - exemplar.offset(), - std::vector{exemplar.child_begin(), exemplar.child_end()}}; - auto deep_copy = std::make_unique(exemplar_without_null_mask); - deep_copy->set_null_mask(std::move(null_mask), null_count); - return deep_copy; -} - std::unique_ptr generate_list_offsets(cudf::column_view const& list_length, rmm::cuda_stream_view stream) { diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp index c9eef0139ea..c8c441e8fae 100644 --- a/java/src/main/native/src/ColumnViewJni.hpp +++ b/java/src/main/native/src/ColumnViewJni.hpp @@ -22,22 +22,6 @@ namespace cudf::jni { -/** - * @brief Creates a deep copy of the exemplar column, with its validity set to the equivalent - * of the boolean `validity` column's value. - * - * The bool_column must have the same number of rows as the exemplar column. - * The result column will have the same number of rows as the exemplar. - * For all indices `i` where the boolean column is `true`, the result column will have a valid value - * at index i. For all other values (i.e. `false` or `null`), the result column will have nulls. - * - * @param exemplar The column to be deep copied. - * @param bool_column bool column whose value is to be used as the validity. - * @return Deep copy of the exemplar, with the replaced validity. - */ -std::unique_ptr new_column_with_boolean_column_as_validity( - cudf::column_view const& exemplar, cudf::column_view const& bool_column); - /** * @brief Generates list offsets with lengths of each list. * diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 7136b162c13..708744569df 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -6395,46 +6395,6 @@ void testReplaceSameIndexColumnInStruct() { assertTrue(e.getMessage().contains("Duplicate mapping found for replacing child index")); } - @Test - void testCopyWithBooleanColumnAsValidity() { - final Boolean T = true; - final Boolean F = false; - final Integer X = null; - - // Straight-line: Invalidate every other row. - try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); - ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, T, F, T, F, T); - ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, 8, X, 10); - ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { - assertColumnsAreEqual(expected, result); - } - - // Straight-line: Invalidate all Rows. - try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); - ColumnVector validity = ColumnVector.fromBoxedBooleans(F, F, F, F, F, F, F, F, F, F); - ColumnVector expected = ColumnVector.fromBoxedInts(X, X, X, X, X, X, X, X, X, X); - ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { - assertColumnsAreEqual(expected, result); - } - - // Nulls in the validity column are treated as invalid. - try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); - ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, T, F, null, F, null); - ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, X, X, X); - ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { - assertColumnsAreEqual(expected, result); - } - - // Negative case: Mismatch in row count. - Exception x = assertThrows(CudfException.class, () -> { - try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); - ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T); - ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { - } - }); - assertTrue(x.getMessage().contains("Exemplar and validity columns must have the same size")); - } - @Test void testSegmentedGather() { HostColumnVector.DataType dt = new ListType(true, new BasicType(true, DType.STRING)); From 557aabf8d0be528881aadb9795e6d92790a085a8 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 3 Sep 2024 11:43:05 -0500 Subject: [PATCH 160/270] Ensure we pass the has_nulls tparam to mixed_join kernels (#16708) Fixes https://github.com/rapidsai/cudf/issues/16706 I'll build/test our stack with this change, but it looks like a typo. If there's a quick unit test we can add I'd be happy to hear recommendations or for someone else to follow on with such a test. 
Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16708 --- cpp/src/join/mixed_join_kernel.cuh | 2 +- cpp/src/join/mixed_join_size_kernel.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 9d011d43de6..368b1fba870 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -130,7 +130,7 @@ void launch_mixed_join(table_device_view left_table, int64_t shmem_size_per_block, rmm::cuda_stream_view stream) { - mixed_join<DEFAULT_JOIN_BLOCK_SIZE, true> + mixed_join<DEFAULT_JOIN_BLOCK_SIZE, has_nulls> <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>( left_table, right_table, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index a1066e32331..84e9be45030 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -124,7 +124,7 @@ std::size_t launch_compute_mixed_join_output_size( // Allocate storage for the counter used to get the size of the join output rmm::device_scalar<std::size_t> size(0, stream, mr); - compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true> + compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, has_nulls> <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>( left_table, right_table, From 25779d95d413e0ddf9379dee22e36eea7bf5f08e Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 3 Sep 2024 12:24:36 -0500 Subject: [PATCH 161/270] Add boost-devel to Java CI Docker image (#16707) Fixes #16678. Adds the boost-devel package to the Java CI Docker environment now that the Boost headers are not being picked up implicitly after libcudf dropped the Arrow dependency in #16640. libcudfjni still requires Arrow for now, and thus requires Boost headers. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Alessandro Bellina (https://github.com/abellina) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16707 --- java/ci/Dockerfile.rocky | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky index 6b87f3ed34e..152af22f7e4 100644 --- a/java/ci/Dockerfile.rocky +++ b/java/ci/Dockerfile.rocky @@ -28,7 +28,7 @@ ARG TARGETPLATFORM=linux/amd64 FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE ARG TOOLSET_VERSION=11 ### Install basic requirements -RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build +RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build boost-devel ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids From 0097b454254ac30739c59dee8f29a91e6643360b Mon Sep 17 00:00:00 2001 From: Hirota Akio <33370421+a-hirota@users.noreply.github.com> Date: Wed, 4 Sep 2024 02:28:16 +0900 Subject: [PATCH 162/270] Fix typo in column_factories.hpp comment from 'depth 1' to 'depth 2' (#16700) This PR fixes a typo in the `cpp/include/cudf/column/column_factories.hpp` file. The comment incorrectly mentioned "data (depth 1)" instead of "data (depth 2)". This correction improves code clarity and documentation accuracy. 
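For readers unfamiliar with the arrow-style layout the corrected comment describes, here is a rough illustration (not part of this patch) using pyarrow, where the offsets at each nesting depth index into the level below:

```python
import pyarrow as pa

# A list<list<int64>> column with two rows: [[1, 2], [3, 4, 5]] and [[6, 7]].
arr = pa.array([[[1, 2], [3, 4, 5]], [[6, 7]]])

print(arr.offsets.to_pylist())    # depth-1 offsets: [0, 2, 3]
inner = arr.values                # the depth-1 "data" is itself a list column
print(inner.offsets.to_pylist())  # depth-2 offsets: [0, 2, 5, 7]
print(inner.values.to_pylist())   # depth-2 data: [1, 2, 3, 4, 5, 6, 7]
```

The fixed line matters because the deepest `data` buffer belongs to depth 2, not depth 1.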
Authors: - Hirota Akio (https://github.com/a-hirota) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16700 --- cpp/include/cudf/column/column_factories.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index c1f295b7ea8..b2dcb25acb5 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -469,7 +469,7 @@ std::unique_ptr make_strings_column(size_type num_strings, * offsets (depth 1) {0, 2, 5, 7} * data (depth 1) * offsets (depth 2) - * data (depth 1) {1, 2, 3, 4, 5, 6, 7} + * data (depth 2) {1, 2, 3, 4, 5, 6, 7} * @endcode * * @param[in] num_rows The number of lists the column represents. From e18b537315c07b73d1eb26354208249605e3e8be Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 3 Sep 2024 08:30:15 -1000 Subject: [PATCH 163/270] Use Series._from_column more consistently to avoid validation (#16716) This modifies cases where `_from_column` provided the same logic or where 1 column was produced so `._from_column` was valid to use Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16716 --- python/cudf/cudf/_lib/text.pyx | 2 +- python/cudf/cudf/core/dataframe.py | 15 ++++----------- python/cudf/cudf/core/series.py | 14 ++++++-------- python/cudf/cudf/io/text.py | 2 +- 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index ece69b424bb..b2c7232f549 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -86,4 +86,4 @@ def read_text(object filepaths_or_buffers, delim, c_options)) - return {None: Column.from_unique_ptr(move(c_col))} + return Column.from_unique_ptr(move(c_col)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0d632f4775f..7a171fe9e05 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -473,15 +473,8 @@ def __getitem__(self, arg): ca = self._frame._data index = self._frame.index if col_is_scalar: - s = Series._from_data( - data=ColumnAccessor( - {key: ca._data[key] for key in column_names}, - multiindex=ca.multiindex, - level_names=ca.level_names, - verify=False, - ), - index=index, - ) + name = column_names[0] + s = Series._from_column(ca._data[name], name=name, index=index) return s._getitem_preprocessed(row_spec) if column_names != list(self._frame._column_names): frame = self._frame._from_data( @@ -7770,8 +7763,8 @@ def interleave_columns(self): "interleave_columns does not support 'category' dtype." 
) - return self._constructor_sliced._from_data( - {None: libcudf.reshape.interleave_columns([*self._columns])} + return self._constructor_sliced._from_column( + libcudf.reshape.interleave_columns([*self._columns]) ) @_performance_tracking diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aadbd80f4b4..48445f018d3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -611,9 +611,7 @@ def from_masked_array(cls, data, mask, null_count=None): 4 14 dtype: int64 """ - col = as_column(data).set_mask(mask) - ca = ColumnAccessor({None: col}, verify=False) - return cls._from_data(ca) + return cls._from_column(as_column(data).set_mask(mask)) @_performance_tracking def __init__( @@ -1150,7 +1148,7 @@ def reset_index( if name is no_default: name = 0 if self.name is None else self.name data[name] = data.pop(self.name) - return cudf.core.dataframe.DataFrame._from_data(data, index) + return self._constructor_expanddim._from_data(data, index) # For ``name`` behavior, see: # https://github.com/pandas-dev/pandas/issues/44575 # ``name`` has to be ignored when `drop=True` @@ -1661,9 +1659,7 @@ def _concat(cls, objs, axis=0, index: bool = True): if len(objs): col = col._with_type_metadata(objs[0].dtype) - return cls._from_data( - ColumnAccessor({name: col}, verify=False), index=result_index - ) + return cls._from_column(col, name=name, index=result_index) @property # type: ignore @_performance_tracking @@ -1977,7 +1973,9 @@ def between(self, left, right, inclusive="both") -> Series: "Inclusive has to be either string of 'both', " "'left', 'right', or 'neither'." ) - return self._from_data({self.name: lmask & rmask}, self.index) + return self._from_column( + lmask & rmask, name=self.name, index=self.index + ) @_performance_tracking def all(self, axis=0, bool_only=None, skipna=True, **kwargs): diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 0043efce1e4..5ce738cae0e 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -33,7 +33,7 @@ def read_text( filepath_or_buffer, "read_text" ) - return cudf.Series._from_data( + return cudf.Series._from_column( libtext.read_text( filepath_or_buffer, delimiter=delimiter, From a83ac6f27254b2ebf99397d81b776c74f93469bf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:07:49 -1000 Subject: [PATCH 164/270] Add return type annotations to MultiIndex (#16696) Mostly just return type annotations. No logic changes. 
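For context, annotating these methods with `Self` (rather than a concrete class name) is what keeps the types correct for subclasses. A minimal sketch of the pattern with made-up classes (assumes Python 3.11+ for `typing.Self`; older interpreters can import it from `typing_extensions`):

```python
from typing import Self

class Index:
    def __init__(self, name: str | None = None) -> None:
        self.name = name

    def rename(self, name: str) -> Self:
        # `Self` binds to the runtime subclass, so MultiIndex().rename(...)
        # is typed as MultiIndex instead of decaying to the base Index.
        return type(self)(name)

class MultiIndex(Index):
    pass

mi: MultiIndex = MultiIndex("a").rename("b")  # type-checks only thanks to `Self`
```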
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16696 --- docs/cudf/source/conf.py | 2 + python/cudf/cudf/core/multiindex.py | 109 ++++++++++++++++------------ 2 files changed, 63 insertions(+), 48 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c58bc42327c..95813907bf4 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -566,6 +566,8 @@ def on_missing_reference(app, env, node, contnode): ("py:obj", "cudf.Index.to_flat_index"), ("py:obj", "cudf.MultiIndex.to_flat_index"), ("py:meth", "pyarrow.Table.to_pandas"), + ("py:class", "pd.DataFrame"), + ("py:class", "pandas.core.indexes.frozen.FrozenList"), ("py:class", "pa.Array"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a66e2936e3b..e00890ac5c3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -247,7 +247,7 @@ def to_series(self, index=None, name=None): ) @_performance_tracking - def astype(self, dtype, copy: bool = True): + def astype(self, dtype, copy: bool = True) -> Self: if not is_object_dtype(dtype): raise TypeError( "Setting a MultiIndex dtype to anything other than object is " @@ -256,7 +256,7 @@ def astype(self, dtype, copy: bool = True): return self @_performance_tracking - def rename(self, names, inplace=False): + def rename(self, names, inplace: bool = False) -> Self | None: """ Alter MultiIndex level names @@ -303,7 +303,9 @@ def rename(self, names, inplace=False): return self.set_names(names, level=None, inplace=inplace) @_performance_tracking - def set_names(self, names, level=None, inplace=False): + def set_names( + self, names, level=None, inplace: bool = False + ) -> Self | None: names_is_list_like = is_list_like(names) level_is_list_like = is_list_like(level) @@ -345,7 +347,7 @@ def _from_data( cls, data: MutableMapping, name: Any = None, - ) -> MultiIndex: + ) -> Self: """ Use when you have a ColumnAccessor-like mapping but no codes and levels. """ @@ -394,7 +396,7 @@ def copy( names=None, deep=False, name=None, - ): + ) -> Self: """Returns copy of MultiIndex object. Returns a copy of `MultiIndex`. The `levels` and `codes` value can be @@ -457,7 +459,7 @@ def copy( ) @_performance_tracking - def __repr__(self): + def __repr__(self) -> str: max_seq_items = pd.get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: @@ -503,7 +505,7 @@ def __repr__(self): @property # type: ignore @_external_only_api("Use ._codes instead") @_performance_tracking - def codes(self): + def codes(self) -> pd.core.indexes.frozen.FrozenList: """ Returns the codes of the underlying MultiIndex. @@ -531,7 +533,7 @@ def get_slice_bound(self, label, side): @property # type: ignore @_performance_tracking - def nlevels(self): + def nlevels(self) -> int: """Integer number of levels in this MultiIndex.""" return self._num_columns @@ -590,7 +592,7 @@ def _get_level_label(self, level): return self.names[level] @_performance_tracking - def isin(self, values, level=None): + def isin(self, values, level=None) -> cp.ndarray: """Return a boolean array where the index values are in values. Compute boolean array of whether each index value is found in @@ -864,7 +866,7 @@ def _validate_indexer( | slice | tuple[Any, ...] 
| list[tuple[Any, ...]], - ): + ) -> None: if isinstance(indexer, numbers.Number): return if isinstance(indexer, tuple): @@ -900,12 +902,12 @@ def __eq__(self, other): @property # type: ignore @_performance_tracking - def size(self): + def size(self) -> int: # The size of a MultiIndex is only dependent on the number of rows. return self._num_rows @_performance_tracking - def take(self, indices): + def take(self, indices) -> Self: if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") obj = super().take(indices) @@ -957,7 +959,12 @@ def __getitem__(self, index): return result @_performance_tracking - def to_frame(self, index=True, name=no_default, allow_duplicates=False): + def to_frame( + self, + index: bool = True, + name=no_default, + allow_duplicates: bool = False, + ) -> cudf.DataFrame: """ Create a DataFrame with the levels of the MultiIndex as columns. @@ -1034,7 +1041,7 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): ) @_performance_tracking - def get_level_values(self, level): + def get_level_values(self, level) -> cudf.Index: """ Return the values at the requested level @@ -1067,30 +1074,30 @@ def get_level_values(self, level): ) return level_values - def _is_numeric(self): + def _is_numeric(self) -> bool: return False - def _is_boolean(self): + def _is_boolean(self) -> bool: return False - def _is_integer(self): + def _is_integer(self) -> bool: return False - def _is_floating(self): + def _is_floating(self) -> bool: return False - def _is_object(self): + def _is_object(self) -> bool: return False - def _is_categorical(self): + def _is_categorical(self) -> bool: return False - def _is_interval(self): + def _is_interval(self) -> bool: return False @classmethod @_performance_tracking - def _concat(cls, objs): + def _concat(cls, objs) -> Self: source_data = [o.to_frame(index=False) for o in objs] # TODO: Verify if this is really necessary or if we can rely on @@ -1100,17 +1107,19 @@ def _concat(cls, objs): for obj in source_data[1:]: obj.columns = colnames - source_data = cudf.DataFrame._concat(source_data) + source_df = cudf.DataFrame._concat(source_data) try: # Only set names if all objs have the same names (names,) = {o.names for o in objs} - {None} except ValueError: - names = [None] * source_data._num_columns - return cudf.MultiIndex.from_frame(source_data, names=names) + names = [None] * source_df._num_columns + return cudf.MultiIndex.from_frame(source_df, names=names) @classmethod @_performance_tracking - def from_tuples(cls, tuples, sortorder: int | None = None, names=None): + def from_tuples( + cls, tuples, sortorder: int | None = None, names=None + ) -> Self: """ Convert list of tuples to MultiIndex. @@ -1153,7 +1162,7 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None): return cls.from_pandas(pdi) @_performance_tracking - def to_numpy(self): + def to_numpy(self) -> np.ndarray: return self.values_host def to_flat_index(self): @@ -1167,7 +1176,7 @@ def to_flat_index(self): @property # type: ignore @_performance_tracking - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the MultiIndex. @@ -1195,7 +1204,7 @@ def values_host(self): @property # type: ignore @_performance_tracking - def values(self): + def values(self) -> cp.ndarray: """ Return a CuPy representation of the MultiIndex. 
@@ -1236,7 +1245,7 @@ def from_frame( df: pd.DataFrame | cudf.DataFrame, sortorder: int | None = None, names=None, - ): + ) -> Self: """ Make a MultiIndex from a DataFrame. @@ -1303,7 +1312,9 @@ def from_frame( @classmethod @_performance_tracking - def from_product(cls, iterables, sortorder: int | None = None, names=None): + def from_product( + cls, iterables, sortorder: int | None = None, names=None + ) -> Self: """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1355,7 +1366,7 @@ def from_arrays( arrays, sortorder=None, names=None, - ) -> MultiIndex: + ) -> Self: """ Convert arrays to MultiIndex. @@ -1410,7 +1421,7 @@ def from_arrays( ) @_performance_tracking - def _poplevels(self, level): + def _poplevels(self, level) -> None | MultiIndex | cudf.Index: """ Remove and return the specified levels from self. @@ -1461,7 +1472,7 @@ def _poplevels(self, level): return popped @_performance_tracking - def swaplevel(self, i=-2, j=-1): + def swaplevel(self, i=-2, j=-1) -> Self: """ Swap level i with level j. Calling this method does not change the ordering of the values. @@ -1512,7 +1523,7 @@ def swaplevel(self, i=-2, j=-1): return midx @_performance_tracking - def droplevel(self, level=-1): + def droplevel(self, level=-1) -> MultiIndex | cudf.Index: """ Removes the specified levels from the MultiIndex. @@ -1598,7 +1609,9 @@ def to_pandas( @classmethod @_performance_tracking - def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): + def from_pandas( + cls, multiindex: pd.MultiIndex, nan_as_null=no_default + ) -> Self: """ Convert from a Pandas MultiIndex @@ -1633,11 +1646,11 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): @cached_property # type: ignore @_performance_tracking - def is_unique(self): + def is_unique(self) -> bool: return len(self) == len(self.unique()) @property - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype("O") @_performance_tracking @@ -1706,7 +1719,7 @@ def is_monotonic_decreasing(self) -> bool: ) @_performance_tracking - def fillna(self, value): + def fillna(self, value) -> Self: """ Fill null values with the specified value. @@ -1758,7 +1771,7 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self): + def _clean_nulls_from_index(self) -> Self: """ Convert all na values(if any) in MultiIndex object to `` as a preprocessing step to `__repr__` methods. 
@@ -1769,20 +1782,20 @@ def _clean_nulls_from_index(self): ) @_performance_tracking - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) usage += sum(level.memory_usage(deep=deep) for level in self._levels) usage += sum(code.memory_usage for code in self._codes) return usage @_performance_tracking - def difference(self, other, sort=None): + def difference(self, other, sort=None) -> Self: if hasattr(other, "to_pandas"): other = other.to_pandas() return cudf.from_pandas(self.to_pandas().difference(other, sort)) @_performance_tracking - def append(self, other): + def append(self, other) -> Self: """ Append a collection of MultiIndex objects together @@ -2000,7 +2013,7 @@ def get_loc(self, key): mask[true_inds] = True return mask - def _get_reconciled_name_object(self, other) -> MultiIndex: + def _get_reconciled_name_object(self, other) -> Self: """ If the result of a set operation will be self, return self, unless the names change, in which @@ -2026,7 +2039,7 @@ def _maybe_match_names(self, other): ] @_performance_tracking - def union(self, other, sort=None): + def union(self, other, sort=None) -> Self: if not isinstance(other, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" try: @@ -2050,7 +2063,7 @@ def union(self, other, sort=None): return self._union(other, sort=sort) @_performance_tracking - def _union(self, other, sort=None): + def _union(self, other, sort=None) -> Self: # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common # logic between MultiIndex._union & BaseIndex._union into @@ -2076,7 +2089,7 @@ def _union(self, other, sort=None): return midx @_performance_tracking - def _intersection(self, other, sort=None): + def _intersection(self, other, sort=None) -> Self: if self.names != other.names: deep = True col_names = list(range(0, self.nlevels)) @@ -2167,7 +2180,7 @@ def _columns_for_reset_index( else: yield from self._split_columns_by_levels(levels, in_levels=True) - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis=None) -> Self: return self._from_data( self._data._from_columns_like_self( super()._repeat([*self._columns], repeats, axis) From fa1486e1d1d09116d2b5f57dfef7d9307ebc76c6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:31:30 -0400 Subject: [PATCH 165/270] Remove ERROR_TEST gtest from libcudf (#16722) Removes the `ERROR_TEST` gtest from libcudf. This test was only verifying some macros on mostly CUDA behavior and not libcudf specific functions. The tests have become troublesome to support in CI especially in conjunction with other tools like `compute-sanitizer`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) URL: https://github.com/rapidsai/cudf/pull/16722 --- cpp/tests/CMakeLists.txt | 4 - cpp/tests/error/error_handling_test.cu | 136 ------------------------- 2 files changed, 140 deletions(-) delete mode 100644 cpp/tests/error/error_handling_test.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f86acbcc51b..1bedb344a01 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -110,10 +110,6 @@ ConfigureTest(SCALAR_TEST scalar/scalar_test.cpp scalar/scalar_device_view_test. 
# * timestamps tests ------------------------------------------------------------------------------ ConfigureTest(TIMESTAMPS_TEST wrappers/timestamps_test.cu) -# ################################################################################################## -# * cudf tests ------------------------------------------------------------------------------------ -ConfigureTest(ERROR_TEST error/error_handling_test.cu) - # ################################################################################################## # * groupby tests --------------------------------------------------------------------------------- ConfigureTest( diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu deleted file mode 100644 index 9c7459fa69d..00000000000 --- a/cpp/tests/error/error_handling_test.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -TEST(ExpectsTest, FalseCondition) -{ - EXPECT_THROW(CUDF_EXPECTS(false, "condition is false"), cudf::logic_error); -} - -TEST(ExpectsTest, TrueCondition) { EXPECT_NO_THROW(CUDF_EXPECTS(true, "condition is true")); } - -TEST(CudaTryTest, Error) { EXPECT_THROW(CUDF_CUDA_TRY(cudaErrorLaunchFailure), cudf::cuda_error); } - -TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); } - -TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); } - -namespace { -// Some silly kernel that will cause an error -CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } -} // namespace - -// In a release build and without explicit synchronization, CUDF_CHECK_CUDA may -// or may not fail on erroneous asynchronous CUDA calls. Invoke -// cudaStreamSynchronize to guarantee failure on error. In a non-release build, -// CUDF_CHECK_CUDA deterministically fails on erroneous asynchronous CUDA -// calls. 
-TEST(StreamCheck, FailedKernel) -{ - rmm::cuda_stream stream; - int a; - test_kernel<<<0, 0, 0, stream.value()>>>(&a); -#ifdef NDEBUG - stream.synchronize(); -#endif - EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); -} - -TEST(StreamCheck, CatchFailedKernel) -{ - rmm::cuda_stream stream; - int a; - test_kernel<<<0, 0, 0, stream.value()>>>(&a); -#ifndef NDEBUG - stream.synchronize(); -#endif - EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); -} - -CUDF_KERNEL void kernel() { asm("trap;"); } - -TEST(DeathTest, CudaFatalError) -{ - testing::FLAGS_gtest_death_test_style = "threadsafe"; - auto call_kernel = []() { - kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>(); - try { - CUDF_CUDA_TRY(cudaDeviceSynchronize()); - } catch (const cudf::fatal_cuda_error& fe) { - std::abort(); - } - }; - ASSERT_DEATH(call_kernel(), ""); -} - -#ifndef NDEBUG - -CUDF_KERNEL void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } - -CUDF_KERNEL void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } - -TEST(DebugAssertDeathTest, cudf_assert_false) -{ - testing::FLAGS_gtest_death_test_style = "threadsafe"; - - auto call_kernel = []() { - auto const stream = cudf::get_default_stream().value(); - assert_false_kernel<<<1, 1, 0, stream>>>(); - - // Kernel should fail with `cudaErrorAssert` - // This error invalidates the current device context, so we need to kill - // the current process. Running with EXPECT_DEATH spawns a new process for - // each attempted kernel launch - if (cudaErrorAssert == cudaDeviceSynchronize()) { std::abort(); } - - // If we reach this point, the cudf_assert didn't work so we exit normally, which will cause - // EXPECT_DEATH to fail. - }; - - EXPECT_DEATH(call_kernel(), "this kernel should die"); -} - -TEST(DebugAssert, cudf_assert_true) -{ - auto const stream = cudf::get_default_stream().value(); - assert_true_kernel<<<1, 1, 0, stream>>>(); - ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); -} - -#endif - -// These tests don't use CUDF_TEST_PROGRAM_MAIN because : -// 1.) They don't need the RMM Pool -// 2.) 
The RMM Pool interferes with the death test -int main(int argc, char** argv) -{ - if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return 0; } - - ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - auto adaptor = make_stream_mode_adaptor(cmd_opts); - return RUN_ALL_TESTS(); -} From 26091a44b3dbf0f56fc0dfc5f081077f2d00681f Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:10:24 -0400 Subject: [PATCH 166/270] Refactor cudf pandas integration tests CI (#16728) Following up #16645 with a couple improvements Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16728 --- ..._library_tests.sh => run-library-tests.sh} | 24 +++++++------------ .../third-party-integration/test.sh | 2 +- 2 files changed, 10 insertions(+), 16 deletions(-) rename ci/cudf_pandas_scripts/third-party-integration/{ci_run_library_tests.sh => run-library-tests.sh} (69%) diff --git a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh similarity index 69% rename from ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh rename to ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh index 54a56508cdc..d44d25d658c 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh @@ -9,23 +9,17 @@ cleanup() { trap cleanup EXIT -runtest_gold() { +runtest() { local lib=$1 + local mode=$2 - pytest \ - -v \ - --continue-on-collection-errors \ - --cache-clear \ - --numprocesses=${NUM_PROCESSES} \ - --dist=worksteal \ - ${TEST_DIR}/test_${lib}*.py -} - -runtest_cudf_pandas() { - local lib=$1 + local plugin="" + if [ "$mode" = "cudf" ]; then + plugin="-p cudf.pandas" + fi pytest \ - -p cudf.pandas \ + $plugin \ -v \ --continue-on-collection-errors \ --cache-clear \ @@ -38,8 +32,8 @@ main() { local lib=$1 # generation phase - runtest_gold ${lib} - runtest_cudf_pandas ${lib} + runtest ${lib} "gold" + runtest ${lib} "cudf" # assertion phase pytest \ diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index 89b28c30e39..f8ddbaba0f3 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -72,7 +72,7 @@ main() { fi done - TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib} + TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib} rapids-logger "Test script exiting with value: ${EXITCODE}" done From 1b6f02d536d253465d2c601f222fb0acede8a942 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 4 Sep 2024 12:02:40 -0500 Subject: [PATCH 167/270] Multi-file and Parquet-aware prefetching from remote storage (#16657) Follow up to https://github.com/rapidsai/cudf/pull/16613 Supersedes https://github.com/rapidsai/cudf/pull/16166 Improves remote-IO read performance when multiple files are read at once. Also enables partial IO for remote Parquet files (previously removed in `24.10` by https://github.com/rapidsai/cudf/pull/16589). 
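The heart of the multi-file speedup is batching byte ranges from all files into a single concurrent transfer. A condensed sketch of the `method="all"` strategy (illustrative only: `fetch_all` is a made-up name, though `sizes` and `cat_ranges` are real fsspec filesystem methods):

```python
import fsspec

def fetch_all(paths, blocksize=256 * 1024 * 1024, storage_options=None):
    # Resolve a filesystem for the remote paths.
    fs, _, paths = fsspec.get_fs_token_paths(
        paths, storage_options=storage_options or {}
    )
    sizes = fs.sizes(paths)
    # Split every file into blocksize-sized ranges...
    ranges = [
        (path, start, min(start + blocksize, size))
        for path, size in zip(paths, sizes)
        for start in range(0, size, blocksize)
    ]
    # ...fetch all ranges in one concurrent call...
    chunks = fs.cat_ranges(*map(list, zip(*ranges)))
    # ...and stitch each file's chunks back together, in order.
    buffers, i = [], 0
    for size in sizes:
        n = -(-size // blocksize)  # ceil division: chunk count for this file
        buffers.append(b"".join(chunks[i : i + n]))
        i += n
    return buffers
```

The `method="parquet"` path refines this idea by asking `fsspec.parquet` for only the byte ranges that cover the requested columns and row groups, instead of transferring whole files.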
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16657 --- python/cudf/cudf/io/parquet.py | 40 +++++++++ python/cudf/cudf/tests/test_s3.py | 47 ++++++++++ python/cudf/cudf/utils/ioutils.py | 141 ++++++++++++++++++++++++++---- 3 files changed, 212 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 526f12aa94e..62be7378e9e 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -577,11 +577,51 @@ def read_parquet( ) filepath_or_buffer = paths if paths else filepath_or_buffer + # Prepare remote-IO options + prefetch_options = kwargs.pop("prefetch_options", {}) + if not ioutils._is_local_filesystem(fs): + # The default prefetch method depends on the + # `row_groups` argument. In most cases we will use + # method="all" by default, because it is fastest + # when we need to read most of the file(s). + # If a (simple) `row_groups` selection is made, we + # use method="parquet" to avoid transferring the + # entire file over the network + method = prefetch_options.get("method") + _row_groups = None + if method in (None, "parquet"): + if row_groups is None: + # If the user didn't specify a method, don't use + # 'parquet' prefetcher for column projection alone. + method = method or "all" + elif all(r == row_groups[0] for r in row_groups): + # Row group selection means we are probably + # reading half the file or less. We should + # avoid a full file transfer by default. + method = "parquet" + _row_groups = row_groups[0] + elif (method := method or "all") == "parquet": + raise ValueError( + "The 'parquet' prefetcher requires a uniform " + "row-group selection for all paths within the " + "same `read_parquet` call. 
" + "Got: {row_groups}" + ) + if method == "parquet": + prefetch_options = prefetch_options.update( + { + "method": method, + "columns": columns, + "row_groups": _row_groups, + } + ) + filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( path_or_data=filepath_or_buffer, fs=fs, storage_options=storage_options, bytes_per_thread=bytes_per_thread, + prefetch_options=prefetch_options, ) # Warn user if they are not using cudf for IO diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 3b23a53091e..0958b68084d 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -229,6 +229,53 @@ def test_read_parquet( assert_eq(expect, got2) +@pytest.mark.parametrize("method", ["all", "parquet"]) +@pytest.mark.parametrize("blocksize", [1024 * 1024, 1024]) +def test_read_parquet_prefetch_options( + s3_base, + s3so, + pdf, + method, + blocksize, +): + bucket = "parquet" + fname_1 = "test_parquet_reader_prefetch_options_1.parquet" + buffer_1 = BytesIO() + pdf.to_parquet(path=buffer_1) + buffer_1.seek(0) + + fname_2 = "test_parquet_reader_prefetch_options_2.parquet" + buffer_2 = BytesIO() + pdf_2 = pdf.copy() + pdf_2["Integer"] += 1 + pdf_2.to_parquet(path=buffer_2) + buffer_2.seek(0) + + with s3_context( + s3_base=s3_base, + bucket=bucket, + files={ + fname_1: buffer_1, + fname_2: buffer_2, + }, + ): + got = cudf.read_parquet( + [ + f"s3://{bucket}/{fname_1}", + f"s3://{bucket}/{fname_2}", + ], + storage_options=s3so, + prefetch_options={ + "method": method, + "blocksize": blocksize, + }, + columns=["String", "Integer"], + ) + + expect = pd.concat([pdf, pdf_2], ignore_index=True)[["String", "Integer"]] + assert_eq(expect, got) + + @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) @pytest.mark.parametrize("columns", [None, ["List", "Struct"]]) @pytest.mark.parametrize("index", [None, "Integer"]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6b146be0fa3..1627107b57d 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1,6 +1,8 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime +import functools +import operator import os import urllib import warnings @@ -18,6 +20,12 @@ from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial +try: + import fsspec.parquet as fsspec_parquet + +except ImportError: + fsspec_parquet = None + _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 @@ -187,6 +195,11 @@ allow_mismatched_pq_schemas : boolean, default False If True, enables reading (matching) columns specified in `columns` and `filters` options from the input files with otherwise mismatched schemas. +prefetch_options : dict, default None + WARNING: This is an experimental feature and may be removed at any + time without warning or deprecation period. + Dictionary of options to use to prefetch bytes from remote storage. + These options are passed through to `get_reader_filepath_or_buffer`. Returns ------- @@ -1439,6 +1452,14 @@ Glob pattern to use when expanding directories into file paths (e.g. "*.json"). If this parameter is not specified, directories will not be expanded. +prefetch_options : dict, default None + WARNING: This is an experimental feature and may be removed at any + time without warning or deprecation period. + Dictionary of options to use to prefetch bytes from remote storage. + These options are only used when `path_or_data` is a list of remote + paths. 
If 'method' is set to 'all' (the default), the only supported + option is 'blocksize' (default 256 MB). If method is set to 'parquet', + 'columns' and 'row_groups' are also supported (default None). Returns ------- @@ -1620,6 +1641,7 @@ def get_reader_filepath_or_buffer( warn_on_raw_text_input=None, warn_meta=None, expand_dir_pattern=None, + prefetch_options=None, ): """{docstring}""" @@ -1690,26 +1712,15 @@ def get_reader_filepath_or_buffer( raw_text_input = True elif fs is not None: - # TODO: We can use cat_ranges and/or parquet-aware logic - # to copy all remote data into host memory at once here. - # The current solution iterates over files, and copies - # ALL data from each file (even when we are performing - # partial IO, and don't need the entire file) if len(paths) == 0: raise FileNotFoundError( f"{input_sources} could not be resolved to any files" ) - filepaths_or_buffers = [ - BytesIO( - _fsspec_data_transfer( - fpath, - fs=fs, - mode=mode, - bytes_per_thread=bytes_per_thread, - ) - ) - for fpath in paths - ] + filepaths_or_buffers = _prefetch_remote_buffers( + paths, + fs, + **(prefetch_options or {}), + ) else: raw_text_input = True @@ -2099,3 +2110,101 @@ def _read_byte_ranges( for worker in workers: worker.join() + + +def _get_remote_bytes_all( + remote_paths, fs, *, blocksize=_BYTES_PER_THREAD_DEFAULT +): + # TODO: Experiment with a heuristic to avoid the fs.sizes + # call when we are reading many files at once (the latency + # of collecting the file sizes is unnecessary in this case) + if max(sizes := fs.sizes(remote_paths)) <= blocksize: + # Don't bother breaking up individual files + return fs.cat_ranges(remote_paths, None, None) + else: + # Construct list of paths, starts, and ends + paths, starts, ends = map( + list, + zip( + *( + (r, j, min(j + blocksize, s)) + for r, s in zip(remote_paths, sizes) + for j in range(0, s, blocksize) + ) + ), + ) + + # Collect the byte ranges + chunks = fs.cat_ranges(paths, starts, ends) + + # Construct local byte buffers + # (Need to make sure path offsets are ordered correctly) + unique_count = dict(zip(*np.unique(paths, return_counts=True))) + offset = np.cumsum([0] + [unique_count[p] for p in remote_paths]) + buffers = [ + functools.reduce(operator.add, chunks[offset[i] : offset[i + 1]]) + for i in range(len(remote_paths)) + ] + return buffers + + +def _get_remote_bytes_parquet( + remote_paths, + fs, + *, + columns=None, + row_groups=None, + blocksize=_BYTES_PER_THREAD_DEFAULT, +): + if fsspec_parquet is None or (columns is None and row_groups is None): + return _get_remote_bytes_all(remote_paths, fs, blocksize=blocksize) + + sizes = fs.sizes(remote_paths) + data = fsspec_parquet._get_parquet_byte_ranges( + remote_paths, + fs, + columns=columns, + row_groups=row_groups, + max_block=blocksize, + ) + + buffers = [] + for size, path in zip(sizes, remote_paths): + path_data = data[path] + buf = np.empty(size, dtype="b") + for range_offset in path_data.keys(): + chunk = path_data[range_offset] + buf[range_offset[0] : range_offset[1]] = np.frombuffer( + chunk, dtype="b" + ) + buffers.append(buf.tobytes()) + return buffers + + +def _prefetch_remote_buffers( + paths, + fs, + *, + method="all", + **prefetch_options, +): + # Gather bytes ahead of time for remote filesystems + if fs and paths and not _is_local_filesystem(fs): + try: + prefetcher = { + "parquet": _get_remote_bytes_parquet, + "all": _get_remote_bytes_all, + }[method] + except KeyError: + raise ValueError( + f"{method} is not a supported remote-data prefetcher." 
+ " Expected 'parquet' or 'all'." + ) + return prefetcher( + paths, + fs, + **prefetch_options, + ) + + else: + return paths From ad1369d2d6eabf4b0ae480a10463a74f3034aece Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 5 Sep 2024 01:11:07 +0200 Subject: [PATCH 168/270] CI: Test against old versions of key dependencies (#16570) This adds explicit tests with old versions of key dependencies. Specifically: - `numba==0.57` - `numpy==1.23` - `pandas==2.0` - ~`fsspec==0.6.0`~ excluded it. `transformers==4.39.3` requires `huggingface_hub` which requires `fsspec>=2023.5.0`. In principle one could include it e.g. only for conda which doesn't pull in `transformers`, but that seemed not worth the trouble? - `cupy==12.0.0` - `pyarrow==16.1.0` See also https://github.com/rapidsai/build-planning/issues/81 (Marking as draft until I see that things work.) Authors: - Sebastian Berg (https://github.com/seberg) - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/16570 --- ci/cudf_pandas_scripts/run_tests.sh | 13 +- ci/test_python_common.sh | 3 +- ci/test_wheel_cudf.sh | 14 ++ ci/test_wheel_cudf_polars.sh | 11 ++ ci/test_wheel_dask_cudf.sh | 13 ++ dependencies.yaml | 22 +++ .../cudf/cudf/tests/indexes/test_interval.py | 4 + .../test_avro_reader_fastavro_integration.py | 5 + python/cudf/cudf/tests/test_binops.py | 41 +++++- python/cudf/cudf/tests/test_categorical.py | 5 + python/cudf/cudf/tests/test_concat.py | 99 ++++++++----- python/cudf/cudf/tests/test_csv.py | 12 +- python/cudf/cudf/tests/test_dataframe.py | 19 ++- python/cudf/cudf/tests/test_datetime.py | 35 ++++- python/cudf/cudf/tests/test_doctests.py | 5 + python/cudf/cudf/tests/test_groupby.py | 112 +++++++++++++++ python/cudf/cudf/tests/test_index.py | 37 ++++- python/cudf/cudf/tests/test_indexing.py | 8 ++ python/cudf/cudf/tests/test_interpolate.py | 4 + python/cudf/cudf/tests/test_interval.py | 5 + python/cudf/cudf/tests/test_join_order.py | 130 +++++++++++++++++- python/cudf/cudf/tests/test_mvc.py | 8 +- python/cudf/cudf/tests/test_numerical.py | 3 +- python/cudf/cudf/tests/test_orc.py | 8 +- python/cudf/cudf/tests/test_parquet.py | 5 + python/cudf/cudf/tests/test_reductions.py | 5 + python/cudf/cudf/tests/test_replace.py | 20 ++- python/cudf/cudf/tests/test_resampling.py | 9 ++ python/cudf/cudf/tests/test_reshape.py | 17 ++- python/cudf/cudf/tests/test_stats.py | 8 ++ .../cudf_pandas_tests/test_cudf_pandas.py | 12 +- .../dask_cudf/tests/test_applymap.py | 6 + .../dask_cudf/tests/test_distributed.py | 5 + .../dask_cudf/dask_cudf/tests/test_groupby.py | 5 + 34 files changed, 638 insertions(+), 70 deletions(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 8b85695c861..1c2724a9a5d 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -54,8 +54,19 @@ else RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist - # echo to expand wildcard before adding `[extra]` requires for pip + echo "" > ./constraints.txt + if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + # `test_python` constraints are for `[test]` not 
`[cudf-pandas-tests]` + rapids-dependency-file-generator \ + --output requirements \ + --file-key test_python \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt + fi + python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index e8849588aa5..d0675b0431a 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -14,7 +14,8 @@ ENV_YAML_DIR="$(mktemp -d)" rapids-dependency-file-generator \ --output conda \ --file-key test_python \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee "${ENV_YAML_DIR}/env.yaml" rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 6861d699695..28ded2f8e0f 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -10,8 +10,22 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +rapids-logger "Install cudf, pylibcudf, and test requirements" + +# Constrain to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_cudf \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi + # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 0baf6c9e277..9844090258a 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -25,9 +25,20 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Installing cudf_polars and its dependencies" +# Constraint to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_cudf_polars \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" diff --git 
a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index fa74b2398f7..0d39807d56c 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,8 +11,21 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements" +# Constraint to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_dask_cudf \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi + # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ diff --git a/dependencies.yaml b/dependencies.yaml index c6851d9cb90..f8b231efd6d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -696,6 +696,28 @@ dependencies: - pytest<8 - pytest-cov - pytest-xdist + specific: + # Define additional constraints for testing with oldest dependencies. + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numba==0.57.* + - numpy==1.23.* + - pandas==2.0.* + - pyarrow==14.0.0 + - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: requirements + # Using --constraints for pip install, so we list cupy multiple times + matrices: + - matrix: {dependencies: "oldest"} + packages: + - cupy-cuda11x==12.0.0 + - cupy-cuda12x==12.0.0 + - matrix: + packages: test_python_pylibcudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 6653a94c9be..25edf788daf 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -149,6 +149,10 @@ def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t): assert_eq(pindex, gindex) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_interval_range_periods_warnings(): start_val, end_val, periods_val = 0, 4, 1.0 diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 2ec1d1d2f28..9d69e626c3d 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -23,6 +23,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -302,6 +303,10 @@ def get_days_from_epoch(date: datetime.date | None) -> int | None: @pytest.mark.parametrize("namespace", [None, "root_ns"]) @pytest.mark.parametrize("nullable", [True, False]) @pytest.mark.parametrize("prepend_null", [True, False]) +@pytest.mark.skipif( + PANDAS_VERSION < 
PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas (datetime(9999, ...) too large)", +) def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): avro_type = {"logicalType": "date", "type": "int"} if nullable: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 4256ec872e6..2e8519509e2 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,11 @@ import cudf from cudf import Index, Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing import _utils as utils, assert_eq from cudf.utils.dtypes import ( @@ -1781,6 +1785,20 @@ def test_datetime_dateoffset_binaryop( reason="https://github.com/pandas-dev/pandas/issues/57448", ) ) + if ( + not PANDAS_GE_220 + and dtype in {"datetime64[ms]", "datetime64[s]"} + and frequency in ("microseconds", "nanoseconds") + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + if ( + not PANDAS_GE_220 + and dtype == "datetime64[us]" + and frequency == "nanoseconds" + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") date_col = [ f"2000-01-01 00:00:{components}", @@ -1834,7 +1852,11 @@ def test_datetime_dateoffset_binaryop( "ignore:Discarding nonzero nanoseconds:UserWarning" ) @pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_datetime_dateoffset_binaryop_multiple(request, date_col, kwargs, op): gsr = cudf.Series(date_col, dtype="datetime64[ns]") psr = gsr.to_pandas() @@ -1873,6 +1895,21 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): def test_datetime_dateoffset_binaryop_reflected( n_periods, frequency, dtype, components ): + if ( + not PANDAS_GE_220 + and dtype in {"datetime64[ms]", "datetime64[s]"} + and frequency in ("microseconds", "nanoseconds") + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + if ( + not PANDAS_GE_220 + and dtype == "datetime64[us]" + and frequency == "nanoseconds" + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + date_col = [ f"2000-01-01 00:00:{components}", f"2000-01-31 00:00:{components}", diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index ae58af8ebce..cd1ad21ae59 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal @@ -858,6 +859,10 @@ def test_cat_from_scalar(scalar): assert_eq(ps, gs) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_cat_groupby_fillna(): ps = pd.Series(["a", "b", "c"], dtype="category") gs = cudf.from_pandas(ps) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 
c1c03de48d4..8da589ba45b 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,6 +9,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal, expect_warning_if @@ -451,45 +452,75 @@ def test_concat_mixed_input(): [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], - [ - pd.Series([1, 2, 3.0, 1.2], name="abc"), - pd.DataFrame({"a": [1, 2]}), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] - ), - pd.DataFrame({"a": [1, 2]}), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + pytest.param( + [ + pd.Series([1, 2, 3.0, 1.2], name="abc"), + pd.DataFrame({"a": [1, 2]}), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] + ), + pd.DataFrame({"a": [1, 2]}), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + ), + pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ] + * 7, + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - ] - * 7, + ), ], ) def test_concat_series_dataframe_input(objs): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 40ba415e681..cee3d23eadc 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -16,9 +16,13 @@ import cudf from cudf import read_csv -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if def make_numeric_dataframe(nrows, dtype): @@ -1270,14 +1274,14 @@ def test_csv_reader_delim_whitespace(): # with 
header row with pytest.warns(FutureWarning): cu_df = read_csv(StringIO(buffer), delim_whitespace=True) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True) assert_eq(pd_df, cu_df) # without header row with pytest.warns(FutureWarning): cu_df = read_csv(StringIO(buffer), delim_whitespace=True, header=None) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): pd_df = pd.read_csv( StringIO(buffer), delim_whitespace=True, header=None ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9122a1074ac..f4d1578bda7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -26,7 +26,11 @@ import cudf from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.errors import MixedTypeError @@ -3561,8 +3565,11 @@ def test_dataframe_empty_sort_index(): @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) def test_dataframe_sort_index( - index, axis, ascending, inplace, ignore_index, na_position + request, index, axis, ascending, inplace, ignore_index, na_position ): + if not PANDAS_GE_220 and axis in (1, "columns") and ignore_index: + pytest.skip(reason="Bug fixed in pandas-2.2") + pdf = pd.DataFrame( {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=index, @@ -3612,6 +3619,10 @@ def test_dataframe_sort_index( @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_dataframe_mulitindex_sort_index( request, axis, level, ascending, inplace, ignore_index, na_position ): @@ -6747,6 +6758,10 @@ def test_dataframe_init_from_arrays_cols(data, cols, index): None, ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_dataframe_assign_scalar(request, col_data, assign_val): request.applymarker( pytest.mark.xfail( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7be4faa42c3..4a2345fc009 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -14,7 +14,11 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.index import DatetimeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -801,6 +805,10 @@ def test_to_datetime_different_formats_notimplemented(): cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas.", +) def test_datetime_can_cast_safely(): sr = cudf.Series( ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" @@ -847,6 +855,10 @@ def 
test_datetime_array_timeunit_cast(dtype): @pytest.mark.parametrize("timeunit", ["D", "W", "M", "Y"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_datetime_scalar_timeunit_cast(timeunit): testscalar = np.datetime64("2016-11-20", timeunit) @@ -1535,6 +1547,10 @@ def test_date_range_start_end_periods(start, end, periods): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_date_range_start_end_freq(start, end, freq): if isinstance(freq, str): _gfreq = _pfreq = freq @@ -1551,6 +1567,10 @@ def test_date_range_start_end_freq(start, end, freq): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_date_range_start_freq_periods(start, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq @@ -1643,6 +1663,9 @@ def test_date_range_raise_overflow(): ], ) def test_date_range_raise_unsupported(freqstr_unsupported): + if not PANDAS_GE_220 and freqstr_unsupported.endswith("E"): + pytest.skip(reason="YE, etc. support was added in pandas 2.2") + s, e = "2001-01-01", "2008-01-31" pd.date_range(start=s, end=e, freq=freqstr_unsupported) with pytest.raises(ValueError, match="does not yet support"): @@ -1654,7 +1677,7 @@ def test_date_range_raise_unsupported(freqstr_unsupported): if freqstr_unsupported != "3MS": freqstr_unsupported = freqstr_unsupported.lower() with pytest.raises(ValueError, match="does not yet support"): - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): cudf.date_range(start=s, end=e, freq=freqstr_unsupported) @@ -1995,6 +2018,10 @@ def test_first(idx, offset): ) ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_first_start_at_end_of_month(idx, offset): p = pd.Series(range(len(idx)), index=idx) g = cudf.from_pandas(p) @@ -2319,6 +2346,10 @@ def test_datetime_to_str(data, dtype): assert_eq(actual.to_pandas(nullable=True), expected) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_datetime_string_to_datetime_resolution_loss_raises(): data = ["2020-01-01 00:00:00.00001"] dtype = "datetime64[s]" diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 794660cffcb..5d3d18cbe95 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -11,6 +11,7 @@ from packaging import version import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION pytestmark = pytest.mark.filterwarnings("ignore::FutureWarning") @@ -96,6 +97,10 @@ def prinoptions(cls): itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), ids=lambda docstring: docstring.name, ) + @pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Doctests not expected to pass on older versions of pandas", + ) def test_docstring(self, docstring): # We ignore differences in whitespace in the doctest output, and enable # the use of an ellipsis "..." 
to match any string in the doctest diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 74f04c0584f..0aaa71e50d7 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -188,6 +188,10 @@ def test_groupby_as_index_single_agg(pdf, gdf, as_index): @pytest.mark.parametrize("engine", ["cudf", "jit"]) @pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_as_index_apply(pdf, gdf, as_index, engine): gdf = gdf.groupby("y", as_index=as_index).apply( lambda df: df["x"].mean(), engine=engine @@ -298,6 +302,10 @@ def assert_values_equal(arr): assert_values_equal(pddf[k].values) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply(): np.random.seed(0) df = DataFrame() @@ -338,6 +346,10 @@ def f3(df, k, L, m): @pytest.mark.parametrize("func,args", create_test_groupby_apply_args_params()) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_args(func, args): np.random.seed(0) df = DataFrame() @@ -500,6 +512,10 @@ def func(df): "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_jit_unary_reductions( func, dtype, dataset, groupby_jit_datasets ): @@ -530,6 +546,10 @@ def func(df): # test unary index reductions for special values +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def groupby_apply_jit_idx_reductions_special_vals_inner( func, data, dtype, special_val ): @@ -555,6 +575,10 @@ def func(df): @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): @@ -583,6 +607,10 @@ def test_groupby_apply_jit_reductions_special_vals( ], ) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="include_groups keyword new in pandas 2.2", +) def test_groupby_apply_jit_idx_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): @@ -593,6 +621,10 @@ def test_groupby_apply_jit_idx_reductions_special_vals( @pytest.mark.parametrize("dtype", ["int32"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_sum_integer_overflow(dtype): max = np.iinfo(dtype).max @@ -627,6 +659,10 @@ def func(group): "large", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): dataset = groupby_jit_datasets[dataset] @@ -653,6 +689,10 @@ def 
func(group): @pytest.mark.parametrize("dtype", ["int32", "int64"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_correlation_zero_variance(dtype): # pearson correlation is undefined when the variance of either # variable is zero. This test ensures that the jit implementation @@ -711,6 +751,10 @@ def func(group): @pytest.mark.parametrize("dtype", ["uint8", "str"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_unsupported_dtype(dtype): df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df["b"] = df["b"].astype(dtype) @@ -739,6 +783,10 @@ def func(group): lambda df: df["val1"].mean() + df["val2"].std(), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_basic(func, groupby_jit_data_small): run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) @@ -759,12 +807,20 @@ def f3(df, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): run_groupby_apply_jit_test( groupby_jit_data_small, func, ["key1", "key2"], *args ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_block_divergence(): # https://github.com/rapidsai/cudf/issues/12686 df = cudf.DataFrame( @@ -782,6 +838,10 @@ def diverging_block(grp_df): run_groupby_apply_jit_test(df, diverging_block, ["a"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_caching(): # Make sure similar functions that differ # by simple things like constants actually @@ -818,6 +878,10 @@ def f(group): assert precompiled.currsize == 3 +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_no_bytecode_fallback(): # tests that a function which contains no bytecode # attribute, but would still be executable using @@ -836,6 +900,10 @@ def f(group): assert_groupby_results_equal(expect, got) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_col_from_df(): # tests a UDF that consists of purely colwise # ops, such as `lambda group: group.x + group.y` @@ -862,6 +930,10 @@ def func(df): @pytest.mark.parametrize("func", [lambda group: group.sum()]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_df(func): # tests a UDF that reduces over a dataframe # and produces a series with the original column names @@ -1940,6 +2012,10 @@ def test_groupby_agg_combinations(agg): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_noempty_group(): pdf = pd.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} @@ -2208,6 +2284,10 @@ def f3(x, k, L, m): @pytest.mark.parametrize( "func,args", 
create_test_groupby_apply_return_scalars_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_scalars(func, args): pdf = pd.DataFrame( { @@ -2266,6 +2346,10 @@ def f5(x, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_return_series_dataframe_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_return_series_dataframe(func, args): pdf = pd.DataFrame( {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} @@ -2744,6 +2828,10 @@ def test_groupby_diff_row_zero_shift(nelem): # TODO: test for category columns when cudf.Scalar supports category type @pytest.mark.parametrize("nelem", [10, 100, 1000]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_fillna_multi_value(nelem): t = rand_dataframe( dtypes_meta=[ @@ -2790,6 +2878,10 @@ def test_groupby_fillna_multi_value(nelem): # TODO: test for category columns when cudf.Scalar supports category type # TODO: cudf.fillna does not support decimal column to column fill yet @pytest.mark.parametrize("nelem", [10, 100, 1000]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_fillna_multi_value_df(nelem): t = rand_dataframe( dtypes_meta=[ @@ -2843,6 +2935,10 @@ def test_groupby_fillna_multi_value_df(nelem): "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] ) @pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_various_by_fillna(by, data, args): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -3146,6 +3242,10 @@ def test_groupby_freq_s(label, closed): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings only given on newer versions.", +) def test_groupby_get_group(pdf, group, name, obj): gdf = cudf.from_pandas(pdf) @@ -3644,6 +3744,10 @@ def test_group_by_pandas_sort_order(groups, sort): "last", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_group_by_empty_reduction(dtype, reduce_op): gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) pdf = gdf.to_pandas() @@ -3664,6 +3768,10 @@ def test_group_by_empty_reduction(dtype, reduce_op): "apply_op", ["sum", "min", "max", "idxmax"], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_group_by_empty_apply(request, dtype, apply_op): request.applymarker( pytest.mark.xfail( @@ -3719,6 +3827,10 @@ def test_groupby_consecutive_operations(): assert_groupby_results_equal(actual, expected, check_dtype=False) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning only given on newer versions.", +) def test_categorical_grouping_pandas_compatibility(): gdf = cudf.DataFrame( { diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 722a64cb553..3f483219423 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,6 +16,11 @@ import cudf from 
cudf.api.extensions import no_default +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -791,9 +796,27 @@ def test_index_to_series(data): "name_data,name_other", [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_index_difference(data, other, sort, name_data, name_other): pd_data = pd.Index(data, name=name_data) pd_other = pd.Index(other, name=name_other) + if ( + not PANDAS_GE_220 + and isinstance(pd_data.dtype, pd.CategoricalDtype) + and not isinstance(pd_other.dtype, pd.CategoricalDtype) + and pd_other.isnull().any() + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/issues/57318") + + if ( + not PANDAS_GE_220 + and len(pd_other) == 0 + and len(pd_data) != len(pd_data.unique()) + ): + pytest.skip(reason="Bug fixed in pandas-2.2+") gd_data = cudf.from_pandas(pd_data) gd_other = cudf.from_pandas(pd_other) @@ -1017,6 +1040,10 @@ def test_index_equal_misc(data, other): ["abcd", "defgh", "werty", "poiu"], ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_index_append(data, other): pd_data = pd.Index(data) pd_other = pd.Index(other) @@ -1220,6 +1247,10 @@ def test_index_append_error(data, other): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_index_append_list(data, other): pd_data = data pd_other = other @@ -2084,6 +2115,10 @@ def test_get_indexer_multi_numeric_deviate(key, method): @pytest.mark.parametrize("method", ["ffill", "bfill"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_get_indexer_multi_error(method): pi = pd.MultiIndex.from_tuples( [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] @@ -2527,7 +2562,7 @@ def test_isin_index(index, values): ) with expect_warning_if(is_dt_str): got = gidx.isin(values) - with expect_warning_if(is_dt_str): + with expect_warning_if(PANDAS_GE_220 and is_dt_str): expected = pidx.isin(values) assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 9df2852dde8..00ae99466bb 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1016,6 +1016,10 @@ def test_series_setitem_iloc(key, value, nulls): (slice(0, 2), [0.5, 0.25]), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_series_setitem_dtype(key, value): psr = pd.Series([1, 2, 3], dtype="int32") gsr = cudf.from_pandas(psr) @@ -1634,6 +1638,10 @@ def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe( assert_eq(expected, actual) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="No warning in older versions of pandas", +) def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) pdf = gdf.to_pandas() diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index a4f0b9fc97e..c76a49103e2 100644 --- 
a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -125,6 +125,10 @@ def test_interpolate_series_values_or_index(data, index, method): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not fail on older versions of pandas", +) def test_interpolate_dataframe_error_cases(data, kwargs): gsr = cudf.DataFrame(data) psr = gsr.to_pandas() diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 2d194107658..5e1dd33fbf1 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -6,6 +6,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.testing import assert_eq @@ -168,6 +169,10 @@ def test_interval_index_unique(): @pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex]) @pytest.mark.parametrize("tz", ["US/Eastern", None]) +@pytest.mark.skipif( + condition=not PANDAS_GE_220, + reason="ME frequency new in pandas 2.2", +) def test_interval_with_datetime(tz, box): dti = pd.date_range( start=pd.Timestamp("20180101", tz=tz), diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py index 9ea4ba007d2..9a95f0e01ab 100644 --- a/python/cudf/cudf/tests/test_join_order.py +++ b/python/cudf/cudf/tests/test_join_order.py @@ -1,13 +1,19 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. import itertools +import operator import string +from collections import defaultdict import numpy as np import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.testing import assert_eq @@ -35,10 +41,124 @@ def right(): # Behaviour in sort=False case didn't match documentation in many # cases prior to https://github.com/pandas-dev/pandas/pull/54611 # (released as part of pandas 2.2) -def expected(left, right, sort, *, how): - left = left.to_pandas() - right = right.to_pandas() - return left.merge(right, on="key", how=how, sort=sort) +if PANDAS_GE_220: + # Behaviour in sort=False case didn't match documentation in many + # cases prior to https://github.com/pandas-dev/pandas/pull/54611 + # (released as part of pandas 2.2) + def expected(left, right, sort, *, how): + left = left.to_pandas() + right = right.to_pandas() + return left.merge(right, on="key", how=how, sort=sort) + +else: + + def expect_inner(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + continue + for i in right_have[k]: + keys.append(k) + val_x.append(v) + val_y.append(right_val[i]) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. 
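+            # (A concrete illustration of that stability guarantee:
+            #   sorted([(1, "b"), (0, "x"), (1, "a")], key=operator.itemgetter(0))
+            #   gives [(0, "x"), (1, "b"), (1, "a")] -- equal keys keep their
+            #   input order, which is what lets this reference model match
+            #   pandas' sorted-merge output.)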
+ keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_left(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_outer(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + left_have = set(left_key) + for k, v in zip(right_key, right_val): + if k not in left_have: + keys.append(k) + val_x.append(None) + val_y.append(v) + + # Python sort is stable, so this will preserve input order for + # equal items. + # outer joins are always sorted, but we test both sort values + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expected(left, right, sort, *, how): + if how == "inner": + return expect_inner(left, right, sort) + elif how == "outer": + return expect_outer(left, right, sort) + elif how == "left": + return expect_left(left, right, sort) + elif how == "right": + return expect_left(right, left, sort).rename( + {"val_x": "val_y", "val_y": "val_x"}, axis=1 + ) + else: + raise NotImplementedError() @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py index 7dd25ebc500..055bc5757b3 100644 --- a/python/cudf/cudf/tests/test_mvc.py +++ b/python/cudf/cudf/tests/test_mvc.py @@ -1,8 +1,9 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
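# (Editor's aside: the compat gates used throughout this patch --
# PANDAS_GE_220, PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, and the
# numba check added below -- all reduce to comparing parsed versions once at
# import time. A minimal sketch, assuming only `packaging` and `pytest`; the
# names are illustrative, not the exact cudf.core._compat definitions:
#
#     import pandas as pd
#     import pytest
#     from packaging import version
#
#     PANDAS_VERSION = version.parse(pd.__version__)
#     PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0")
#
#     @pytest.mark.skipif(not PANDAS_GE_220, reason="needs pandas>=2.2 behavior")
#     def test_new_behavior():
#         ...
#
# Computing the flag once at import keeps the marks cheap and makes the skip
# reasons explicit in test reports.)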
import subprocess import sys import pytest +from packaging import version IS_CUDA_11 = False IS_CUDA_12 = False @@ -14,9 +15,12 @@ # do not test cuda 12 if pynvjitlink isn't present HAVE_PYNVJITLINK = False try: + import numba import pynvjitlink # noqa: F401 - HAVE_PYNVJITLINK = True + HAVE_PYNVJITLINK = version.parse(numba.__version__) >= version.parse( + "0.58" + ) except ModuleNotFoundError: pass diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 1b0589254f5..b1a2f081cd2 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.testing import assert_eq from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -373,7 +374,7 @@ def test_to_numeric_error(data, errors): ): cudf.to_numeric(data, errors=errors) else: - with expect_warning_if(errors == "ignore"): + with expect_warning_if(PANDAS_GE_220 and errors == "ignore"): expect = pd.to_numeric(data, errors=errors) with expect_warning_if(errors == "ignore"): got = cudf.to_numeric(data, errors=errors) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index e0884a5819a..c2a30b76bea 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1679,7 +1679,13 @@ def run_orc_columns_and_index_param(index_obj, index, columns): "columns", [ None, - [], + pytest.param( + [], + marks=pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Bug in older version of pandas", + ), + ), ], ) def test_orc_columns_and_index_param(index_obj, index, columns): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 6623c537ddf..8b59a7eef08 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -23,6 +23,7 @@ import cudf from cudf._lib.parquet import read_parquet_chunked +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -3034,6 +3035,10 @@ def test_parquet_reader_rle_boolean(datadir): # a list column in a schema, the cudf reader was confusing # nesting information between a list column and a subsequent # string column, ultimately causing a crash. +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Older versions of pandas do not have DataFrame.map()", +) def test_parquet_reader_one_level_list2(datadir): # we are reading in a file containing binary types, but cudf returns # those as strings. 
so we have to massage the pandas data to get diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index a70a2ea15dd..f276f394cd0 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -10,6 +10,7 @@ import cudf from cudf import Series +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import _utils as utils, assert_eq from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand @@ -342,6 +343,10 @@ def test_any_all_axis_none(data, op): "median", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) def test_reductions_axis_none_warning(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) pdf = df.to_pandas() diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e5ee0127a74..3a8928297c0 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -10,7 +10,11 @@ import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -66,7 +70,7 @@ def test_series_replace_all(gsr, to_replace, value): ) with expect_warning_if(expect_warn): actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) - with expect_warning_if(expect_warn): + with expect_warning_if(expect_warn and PANDAS_GE_220): if pd_value is None: # TODO: Remove this workaround once cudf # introduces `no_default` values @@ -91,7 +95,7 @@ def test_series_replace(): # Categorical psr3 = pd.Series(["one", "two", "three"], dtype="category") - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): psr4 = psr3.replace("one", "two") sr3 = cudf.from_pandas(psr3) with pytest.warns(FutureWarning): @@ -100,7 +104,7 @@ def test_series_replace(): psr4.sort_values().reset_index(drop=True), sr4.sort_values().reset_index(drop=True), ) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): psr5 = psr3.replace("one", "five") with pytest.warns(FutureWarning): sr5 = sr3.replace("one", "five") @@ -517,7 +521,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): pd.date_range( "2010-01-01", "2020-01-10", - freq="1YE", + freq="1YE" if PANDAS_GE_220 else "1y", ) ), pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), @@ -564,7 +568,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): pd.date_range( "2010-01-01", "2020-01-10", - freq="1YE", + freq="1YE" if PANDAS_GE_220 else "1y", ) ) + pd.Timedelta("1d"), @@ -1069,6 +1073,10 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) def test_replace_inplace(pframe, replace_args): gpu_frame = cudf.from_pandas(pframe) pandas_frame = pframe.copy() diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index 95fa8e9a50a..a61477981f8 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ 
b/python/cudf/cudf/tests/test_resampling.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq @@ -147,6 +148,10 @@ def test_dataframe_resample_level(): ("10D", "1D", "s"), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): # test that we cast to the appropriate frequency # when resampling: @@ -164,6 +169,10 @@ def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): assert got.index.dtype == np.dtype(f"datetime64[{out_freq}]") +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_resampling_downsampling_ms(): pdf = pd.DataFrame( { diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 50db4302b75..4235affd4d1 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -8,10 +8,19 @@ import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing import assert_eq -from cudf.testing._utils import ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES +from cudf.testing._utils import ( + ALL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + expect_warning_if, +) pytest_xfail = pytest.mark.xfail pytestmark = pytest.mark.spilling @@ -220,7 +229,7 @@ def test_df_stack_multiindex_column_axis(columns, index, level, dropna): with pytest.warns(FutureWarning): got = gdf.stack(level=level, dropna=dropna, future_stack=False) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = pdf.stack(level=level, dropna=dropna, future_stack=False) assert_eq(expect, got, check_dtype=False) @@ -265,7 +274,7 @@ def test_df_stack_multiindex_column_axis_pd_example(level): df = pd.DataFrame(np.random.randn(4, 4), columns=columns) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = df.stack(level=level, future_stack=False) gdf = cudf.from_pandas(df) with pytest.warns(FutureWarning): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index d5f63fdab77..f952cea07f8 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -447,6 +447,10 @@ def test_cov1d(data1, data2): ], ) @pytest.mark.parametrize("method", ["spearman", "pearson"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings missing on older pandas (scipy version seems unrelated?)", +) def test_corr1d(data1, data2, method): if method == "spearman": # Pandas uses scipy.stats.spearmanr code-path @@ -585,6 +589,10 @@ def test_min_count_ops(data, ops, skipna, min_count): ], ) @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_cov_corr_datetime_timedelta(data1, data2, dtype): gsr1 = cudf.Series(data1, dtype=dtype) gsr2 = cudf.Series(data2, dtype=dtype) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py 
index 505d5d0b9cc..d10c531d757 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -23,6 +23,7 @@ from numba import NumbaDeprecationWarning from pytz import utc +from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object @@ -536,12 +537,15 @@ def test_array_ufunc(series): @pytest.mark.xfail(strict=False, reason="Fails in CI, passes locally.") def test_groupby_apply_func_returns_series(dataframe): pdf, df = dataframe + if PANDAS_GE_220: + kwargs = {"include_groups": False} + else: + kwargs = {} + expect = pdf.groupby("a").apply( - lambda group: pd.Series({"x": 1}), include_groups=False - ) - got = df.groupby("a").apply( - lambda group: xpd.Series({"x": 1}), include_groups=False + lambda group: pd.Series({"x": 1}), **kwargs ) + got = df.groupby("a").apply(lambda group: xpd.Series({"x": 1}), **kwargs) tm.assert_equal(expect, got) diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py index d84235481c3..e4e79b7b8cf 100644 --- a/python/dask_cudf/dask_cudf/tests/test_applymap.py +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -5,6 +5,8 @@ from dask import dataframe as dd +from cudf.core._compat import PANDAS_GE_210 + from dask_cudf.tests.utils import _make_random_frame @@ -18,6 +20,10 @@ ], ) @pytest.mark.parametrize("has_na", [True, False]) +@pytest.mark.skipif( + not PANDAS_GE_210, + reason="DataFrame.map requires pandas>=2.1.0", +) def test_applymap_basic(func, has_na): size = 2000 pdf, dgdf = _make_random_frame(size, include_na=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index be10b0d4843..d03180852eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -80,6 +80,11 @@ def test_str_series_roundtrip(): def test_p2p_shuffle(): + pytest.importorskip( + "pyarrow", + minversion="14.0.1", + reason="P2P shuffling requires pyarrow>=14.0.1", + ) # Check that we can use `shuffle_method="p2p"` with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: with Client(cluster): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index cf916b713b2..7b9f0ca328a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -9,6 +9,7 @@ from dask.utils_test import hlg_layer import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing._utils import expect_warning_if import dask_cudf @@ -316,6 +317,10 @@ def test_groupby_dropna_cudf(dropna, by): (None, ["a", "d"]), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_dropna_dask(dropna, by): # NOTE: This test is borrowed from upstream dask # (dask/dask/dataframe/tests/test_groupby.py) From e1ab1e799d7a29289419014e19ec5c6f2e99ae91 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:48:03 -0400 Subject: [PATCH 169/270] Make isinstance check pass for proxy ndarrays (#16601) Closes #14537. 
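The mechanism is NumPy subclassing via view casting: the final ndarray proxy type now derives from a real numpy.ndarray subclass, so isinstance checks and NumPy's dispatch machinery see a genuine ndarray. A minimal, self-contained sketch of the idea (class and attribute names here are illustrative, not the exact cudf.pandas internals; the real change is in proxy_base.py below):

    import numpy as np

    class ProxyArray(np.ndarray):
        def __new__(cls, arr):
            # View casting: reinterpret an existing ndarray as the subclass
            # without copying the underlying buffer.
            return np.asarray(arr).view(cls)

        def __array_finalize__(self, obj):
            # Runs for explicit construction, view casting, and
            # new-from-template, so slices keep the wrapped payload.
            if obj is None:
                return
            self._wrapped = getattr(obj, "_wrapped", obj)

    a = ProxyArray(np.arange(4))
    assert isinstance(a, np.ndarray)      # the check this change makes pass
    assert isinstance(a[:2], ProxyArray)  # views preserve the subclass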
Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16601
---
 python/cudf/cudf/pandas/_wrappers/numpy.py | 23 +++++++++
 python/cudf/cudf/pandas/fast_slow_proxy.py | 26 +++++++++-
 python/cudf/cudf/pandas/proxy_base.py | 22 ++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py | 50 ++++++++++++++++++-
 4 files changed, 119 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf/pandas/proxy_base.py

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 90ac5198270..d5e669cb58f 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -10,10 +10,13 @@ from packaging import version

 from ..fast_slow_proxy import (
+    _fast_slow_function_call,
     _FastSlowAttribute,
+    is_proxy_object,
     make_final_proxy_type,
     make_intermediate_proxy_type,
 )
+from ..proxy_base import ProxyNDarrayBase
 from .common import (
     array_interface,
     array_method,
@@ -105,18 +108,38 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     return super(cls, cls)._fsproxy_wrap(arr, constructor)

+def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+    result, _ = _fast_slow_function_call(
+        getattr(ufunc, method),
+        *inputs,
+        **kwargs,
+    )
+    if isinstance(result, tuple):
+        if is_proxy_object(result[0]) and isinstance(
+            result[0]._fsproxy_wrapped, numpy.ndarray
+        ):
+            return tuple(numpy.asarray(x) for x in result)
+    elif is_proxy_object(result) and isinstance(
+        result._fsproxy_wrapped, numpy.ndarray
+    ):
+        return numpy.asarray(result)
+    return result
+
 ndarray = make_final_proxy_type(
     "ndarray",
     cupy.ndarray,
     numpy.ndarray,
     fast_to_slow=cupy.ndarray.get,
     slow_to_fast=cupy.asarray,
+    bases=(ProxyNDarrayBase,),
     additional_attributes={
         "__array__": array_method,
         # So that pa.array(wrapped-numpy-array) works
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
         "__array_interface__": array_interface,
+        "__array_ufunc__": ndarray__array_ufunc__,
         # ndarrays are unhashable
         "__hash__": None,
         # iter(cupy-array) produces an iterable of zero-dim device
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 4b0fd9a5b36..afa1ce5f86c 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -19,6 +19,7 @@ from ..options import _env_get_bool
 from ..testing import assert_eq
 from .annotation import nvtx
+from .proxy_base import ProxyNDarrayBase

 def call_operator(fn, args, kwargs):
@@ -564,7 +565,17 @@ def _fsproxy_wrap(cls, value, func):
         _FinalProxy subclasses can override this classmethod if they need
         particular behaviour when wrapped up.
         """
-        proxy = object.__new__(cls)
+        # TODO: Replace the if-elif-else using singledispatch helper function
+        base_class = _get_proxy_base_class(cls)
+        if base_class is object:
+            proxy = base_class.__new__(cls)
+        elif base_class is ProxyNDarrayBase:
+            proxy = base_class.__new__(cls, value)
+        else:
+            raise TypeError(
+                f"Cannot create a proxy instance of {cls.__name__} using base class {base_class.__name__}. 
" + f"Expected either 'object' or another type in 'PROXY_BASE_CLASSES'" + ) proxy._fsproxy_wrapped = value return proxy @@ -1193,6 +1204,19 @@ def is_proxy_object(obj: Any) -> bool: return False +def _get_proxy_base_class(cls): + """Returns the proxy base class if one exists""" + for proxy_class in PROXY_BASE_CLASSES: + if proxy_class in cls.__mro__: + return proxy_class + return object + + +PROXY_BASE_CLASSES: set[type] = { + ProxyNDarrayBase, +} + + NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py new file mode 100644 index 00000000000..6f732834e94 --- /dev/null +++ b/python/cudf/cudf/pandas/proxy_base.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import cupy as cp +import numpy as np + + +class ProxyNDarrayBase(np.ndarray): + def __new__(cls, arr): + if isinstance(arr, cp.ndarray): + arr = arr.get() + if not isinstance(arr, np.ndarray): + raise TypeError( + "Unsupported array type. Must be numpy.ndarray or cupy.ndarray" + ) + return np.asarray(arr, dtype=arr.dtype).view(cls) + + def __array_finalize__(self, obj): + if obj is None: + return + self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", obj) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index d10c531d757..c4ab4b0a853 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -14,18 +14,20 @@ import types from io import BytesIO, StringIO +import cupy as cp import jupyter_client import nbformat import numpy as np import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning +from numba import NumbaDeprecationWarning, vectorize from pytz import utc from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object +from cudf.testing import assert_eq if not LOADED: raise ImportError("These tests must be run with cudf.pandas loaded") @@ -1690,3 +1692,49 @@ def test_notebook_slow_repr(): assert ( string in html_result ), f"Expected string {string} not found in the output" + + +def test_numpy_ndarray_isinstancecheck(array): + arr1, arr2 = array + assert isinstance(arr1, np.ndarray) + assert isinstance(arr2, np.ndarray) + + +def test_numpy_ndarray_np_ufunc(array): + arr1, arr2 = array + + @np.vectorize + def add_one_ufunc(arr): + return arr + 1 + + assert_eq(add_one_ufunc(arr1), add_one_ufunc(arr2)) + + +def test_numpy_ndarray_cp_ufunc(array): + arr1, arr2 = array + + @cp.vectorize + def add_one_ufunc(arr): + return arr + 1 + + assert_eq(add_one_ufunc(cp.asarray(arr1)), add_one_ufunc(arr2)) + + +def test_numpy_ndarray_numba_ufunc(array): + arr1, arr2 = array + + @vectorize + def add_one_ufunc(arr): + return arr + 1 + + assert_eq(add_one_ufunc(arr1), add_one_ufunc(arr2)) + + +def test_numpy_ndarray_numba_cuda_ufunc(array): + arr1, arr2 = array + + @vectorize(["int64(int64)"], target="cuda") + def add_one_ufunc(a): + return a + 1 + + assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2))) From 949f1719226f0b27a4df8fedbf4624f46fb0589d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:52:01 -0400 Subject: [PATCH 170/270] Performance improvement for 
strings::slice for wide strings (#16574)

Improves performance of wide strings (avg > 64 bytes) when using `cudf::strings::slice_strings`.
Addresses some concerns from issue #15924

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16574
---
 cpp/src/strings/slice.cu | 182 ++++++++++++++++++++++++++++++---------
 1 file changed, 141 insertions(+), 41 deletions(-)

diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index cf82a837c51..d8324a9b08e 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <cudf/detail/utilities/cuda.cuh>
 #include
 #include
 #include
@@ -32,6 +33,8 @@
 #include
 #include
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
 #include
 #include
 #include
@@ -40,6 +43,9 @@ namespace cudf {
 namespace strings {
 namespace detail {
 namespace {
+
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 128;
+
 /**
  * @brief Function logic for compute_substrings_from_fn API
  *
  * This both calculates the output size and executes the substring
  */
 struct substring_from_fn {
   IndexIterator const starts;
   IndexIterator const stops;

-  __device__ string_view operator()(size_type idx) const
+  __device__ string_index_pair operator()(size_type idx) const
   {
-    if (d_column.is_null(idx)) { return string_view{nullptr, 0}; }
+    if (d_column.is_null(idx)) { return string_index_pair{nullptr, 0}; }
     auto const d_str  = d_column.template element<string_view>(idx);
     auto const length = d_str.length();
     auto const start  = std::max(starts[idx], 0);
-    if (start >= length) { return string_view{}; }
+    if (start >= length) { return string_index_pair{"", 0}; }

-    auto const stop = stops[idx];
-    auto const end  = (((stop < 0) || (stop > length)) ? length : stop);
-    return start < end ? d_str.substr(start, end - start) : string_view{};
+    auto const stop    = stops[idx];
+    auto const end     = (((stop < 0) || (stop > length)) ? length : stop);
+    auto const sub_str = start < end ? d_str.substr(start, end - start) : string_view{};
+    return sub_str.empty() ? string_index_pair{"", 0}
+                           : string_index_pair{sub_str.data(), sub_str.size_bytes()};
   }

   substring_from_fn(column_device_view const& d_column, IndexIterator starts, IndexIterator stops)
     : d_column(d_column), starts(starts), stops(stops)
   {
   }
 };

+template <typename IndexIterator>
+CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,
+                                       IndexIterator starts,
+                                       IndexIterator stops,
+                                       string_index_pair* d_output)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
+
+  namespace cg = cooperative_groups;
+  auto const warp = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+
+  if (d_strings.is_null(str_idx)) {
+    if (warp.thread_rank() == 0) { d_output[str_idx] = string_index_pair{nullptr, 0}; }
+    return;
+  }
+  auto const d_str = d_strings.element<string_view>(str_idx);
+  if (d_str.empty()) {
+    if (warp.thread_rank() == 0) { d_output[str_idx] = string_index_pair{"", 0}; }
+    return;
+  }
+
+  auto const start = max(starts[str_idx], 0);
+  auto stop        = [stop = stops[str_idx]] {
+    return (stop < 0) ? std::numeric_limits<size_type>::max() : stop;
+  }();
+  auto const end = d_str.data() + d_str.size_bytes();
+
+  auto start_counts = thrust::make_pair(0, 0);
+  auto stop_counts  = thrust::make_pair(0, 0);
+
+  auto itr = d_str.data() + warp.thread_rank();
+
+  size_type char_count = 0;
+  size_type byte_count = 0;
+  while (byte_count < d_str.size_bytes()) {
+    if (char_count <= start) { start_counts = {char_count, byte_count}; }
+    if (char_count <= stop) {
+      stop_counts = {char_count, byte_count};
+    } else {
+      break;
+    }
+    size_type const cc = (itr < end) && is_begin_utf8_char(*itr);
+    size_type const bc = (itr < end);
+    char_count += cg::reduce(warp, cc, cg::plus<size_type>());
+    byte_count += cg::reduce(warp, bc, cg::plus<size_type>());
+    itr += cudf::detail::warp_size;
+  }
+
+  if (warp.thread_rank() == 0) {
+    if (start >= char_count) {
+      d_output[str_idx] = string_index_pair{"", 0};
+      return;
+    }
+
+    // we are just below start/stop and must now increment up to it from here
+    auto first_byte = start_counts.second;
+    if (start_counts.first < start) {
+      auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte);
+      first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
+    }
+
+    stop           = min(stop, char_count);
+    auto last_byte = stop_counts.second;
+    if (stop_counts.first < stop) {
+      auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte);
+      last_byte += std::get<0>(bytes_to_character_position(sub_str, stop - stop_counts.first));
+    }
+
+    d_output[str_idx] = (first_byte < last_byte)
+                          ? string_index_pair{d_str.data() + first_byte, last_byte - first_byte}
+                          : string_index_pair{"", 0};
+  }
+}
+
 /**
  * @brief Function logic for the substring API.
  *
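The kernel above assigns one warp per string: the 32 threads stride through the string's bytes a warp-width at a time, each thread flags whether its byte is a UTF-8 lead byte (any byte not matching 0b10xxxxxx), and `cg::reduce` folds those flags into running character and byte counts. Lane 0 then walks the final sub-chunk with `bytes_to_character_position` to pin down the exact byte offsets of the start and stop character positions. A serial Python reference of that per-string bookkeeping (illustrative only, assuming non-negative positions; the real kernel of course runs on device memory in parallel):

    def slice_utf8_bytes(s_bytes: bytes, start_char: int, stop_char: int) -> bytes:
        # Locate the byte offsets of character positions by counting UTF-8
        # lead bytes, i.e. bytes not of the form 0b10xxxxxx.
        char_count = 0
        first_byte = last_byte = len(s_bytes)
        for i, b in enumerate(s_bytes):
            if (b & 0xC0) != 0x80:  # lead byte: a new character starts here
                if char_count == start_char:
                    first_byte = i
                if char_count == stop_char:
                    last_byte = i
                    break
                char_count += 1
        return s_bytes[first_byte:last_byte]

    # "é" is two bytes, so character positions and byte offsets diverge.
    assert slice_utf8_bytes("héllo".encode(), 1, 3) == "él".encode()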
@@ -149,54 +233,67 @@ struct substring_fn {
  *
  * @tparam IndexIterator Iterator type for character position values
  *
- * @param d_column Input strings column to substring
+ * @param input Input strings column to substring
  * @param starts Start positions index iterator
  * @param stops Stop positions index iterator
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  */
 template <typename IndexIterator>
-std::unique_ptr<column> compute_substrings_from_fn(column_device_view const& d_column,
+std::unique_ptr<column> compute_substrings_from_fn(strings_column_view const& input,
                                                    IndexIterator starts,
                                                    IndexIterator stops,
                                                    rmm::cuda_stream_view stream,
                                                    rmm::device_async_resource_ref mr)
 {
-  auto results = rmm::device_uvector<string_view>(d_column.size(), stream);
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::counting_iterator<size_type>(0),
-                    thrust::counting_iterator<size_type>(d_column.size()),
-                    results.begin(),
-                    substring_from_fn{d_column, starts, stops});
-  return make_strings_column(results, string_view{nullptr, 0}, stream, mr);
+  auto results = rmm::device_uvector<string_index_pair>(input.size(), stream);
+
+  auto const d_column = column_device_view::create(input.parent(), stream);
+
+  if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::counting_iterator<size_type>(0),
+                      thrust::counting_iterator<size_type>(input.size()),
+                      results.begin(),
+                      substring_from_fn{*d_column, starts, stops});
+  } else {
+    constexpr thread_index_type block_size = 512;
+    auto const threads =
+      static_cast<thread_index_type>(input.size()) * cudf::detail::warp_size;
+    auto const num_blocks = util::div_rounding_up_safe(threads, block_size);
+    substring_from_kernel<IndexIterator>
+      <<<num_blocks, block_size, 0, stream.value()>>>(*d_column, starts, stops, results.data());
+  }
+  return make_strings_column(results.begin(), results.end(), stream, mr);
 }
 
 }  // namespace
 
 //
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       numeric_scalar<size_type> const& start,
                                       numeric_scalar<size_type> const& stop,
                                       numeric_scalar<size_type> const& step,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (strings.is_empty()) return make_empty_column(type_id::STRING);
+  if (input.size() == input.null_count()) {
+    return std::make_unique<column>(input.parent(), stream, mr);
+  }
 
   auto const step_valid = step.is_valid(stream);
-  auto const step_value = step_valid ? step.value(stream) : 0;
+  auto const step_value = step_valid ? step.value(stream) : 1;
   if (step_valid) { CUDF_EXPECTS(step_value != 0, "Step parameter must not be 0"); }
 
-  auto const d_column = column_device_view::create(strings.parent(), stream);
-
-  // optimization for (step==1 and start < stop) -- expect this to be most common
-  if (step_value == 1 and start.is_valid(stream) and stop.is_valid(stream)) {
-    auto const start_value = start.value(stream);
-    auto const stop_value  = stop.value(stream);
+  if (step_value == 1) {
+    auto const start_value = start.is_valid(stream) ? start.value(stream) : 0;
+    auto const stop_value =
+      stop.is_valid(stream) ? stop.value(stream) : std::numeric_limits<size_type>::max();
     // note that any negative values here must use the alternate function below
     if ((start_value >= 0) && (start_value < stop_value)) {
       // this is about 2x faster on long strings for this common case
-      return compute_substrings_from_fn(*d_column,
+      return compute_substrings_from_fn(input,
                                         thrust::constant_iterator<size_type>(start_value),
                                         thrust::constant_iterator<size_type>(stop_value),
                                         stream,
@@ -204,31 +301,35 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
     }
   }
 
+  auto const d_column = column_device_view::create(input.parent(), stream);
+
   auto const d_start = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(start));
   auto const d_stop  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
   auto const d_step  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));
 
   auto [offsets, chars] = make_strings_children(
-    substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr);
+    substring_fn{*d_column, d_start, d_stop, d_step}, input.size(), stream, mr);
 
-  return make_strings_column(strings.size(),
+  return make_strings_column(input.size(),
                              std::move(offsets),
                              chars.release(),
-                             strings.null_count(),
-                             cudf::detail::copy_bitmask(strings.parent(), stream, mr));
+                             input.null_count(),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
 
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       column_view const& starts_column,
                                       column_view const& stops_column,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
-  CUDF_EXPECTS(starts_column.size() == strings_count,
+  if (input.size() == input.null_count()) {
+    return std::make_unique<column>(input.parent(), stream, mr);
+  }
+
+  CUDF_EXPECTS(starts_column.size() == input.size(),
               "Parameter starts must have the same number of rows as strings.");
-  CUDF_EXPECTS(stops_column.size() == strings_count,
+  CUDF_EXPECTS(stops_column.size() == input.size(),
               "Parameter stops must have the same number of rows as strings.");
   CUDF_EXPECTS(cudf::have_same_types(starts_column, stops_column),
               "Parameters starts and stops must be of the same type.",
@@ -242,17 +343,16 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
     "Positions values must be fixed width type.",
     cudf::data_type_error);
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto starts_iter    = cudf::detail::indexalator_factory::make_input_iterator(starts_column);
-  auto stops_iter     = cudf::detail::indexalator_factory::make_input_iterator(stops_column);
-  return compute_substrings_from_fn(*strings_column, starts_iter, stops_iter, stream, mr);
+  auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column);
+  auto stops_iter  = cudf::detail::indexalator_factory::make_input_iterator(stops_column);
+  return compute_substrings_from_fn(input, starts_iter, stops_iter, stream, mr);
 }
 
 }  // namespace detail
 
 // external API
 
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       numeric_scalar<size_type> const& start,
                                       numeric_scalar<size_type> const& stop,
                                       numeric_scalar<size_type> const& step,
@@ -260,17 +360,17 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice_strings(strings, start, stop, step, stream, mr);
+  return detail::slice_strings(input, start, stop, step, stream, mr);
 }
 
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       column_view const& starts_column,
                                       column_view const& stops_column,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice_strings(strings, starts_column, stops_column, stream, mr);
+  return detail::slice_strings(input, starts_column, stops_column, stream, mr);
 }
 
 }  // namespace strings
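The change is transparent to callers: `compute_substrings_from_fn` picks the warp kernel only when the average string width reaches `AVG_CHAR_BYTES_THRESHOLD`, and otherwise keeps the original per-row transform. One way to exercise the wide-string path from Python (illustrative; `Series.str.slice` mirrors the pandas accessor and, to the best of my reading, is backed by `cudf::strings::slice_strings`):

    import cudf

    # Average length well above the 128-byte threshold should drive the
    # warp-parallel kernel; exact speedups depend on string width.
    s = cudf.Series(["x" * 200, "y" * 300, None])
    out = s.str.slice(start=10, stop=150)
    print(out)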
From 0cc059fb2b81adbdc9593052292838995dc78b10 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Thu, 5 Sep 2024 15:07:29 -0700
Subject: [PATCH 171/270] Upgrade to nvcomp 4.0.1 (#16076)

This PR bumps nvcomp to 4.0.1.

Depends on:
- https://github.com/conda-forge/nvcomp-feedstock/pull/15
- https://github.com/rapidsai/rapids-cmake/pull/633
- https://github.com/rapidsai/kvikio/pull/449

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert Maynard (https://github.com/robertmaynard)
  - Peixin (https://github.com/pxLi)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/16076
---
 ci/build_wheel_cudf.sh                                  | 2 --
 ci/build_wheel_pylibcudf.sh                             | 2 --
 conda/environments/all_cuda-118_arch-x86_64.yaml        | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml        | 2 +-
 conda/recipes/libcudf/conda_build_config.yaml           | 2 +-
 dependencies.yaml                                       | 2 +-
 java/pom.xml                                            | 4 +---
 java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java | 3 ---
 java/src/main/native/CMakeLists.txt                     | 5 ++---
 python/libcudf/CMakeLists.txt                           | 3 +--
 10 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index e5565c4b53c..fb93b06dbe2 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -23,8 +23,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 python -m auditwheel repair \
     --exclude libcudf.so \
     --exclude libnvcomp.so \
-    --exclude libnvcomp_bitcomp.so \
-    --exclude libnvcomp_gdeflate.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index 0e4745bda28..5e9f7f8a0c4 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -21,8 +21,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 python -m auditwheel repair \
     --exclude libcudf.so \
     --exclude libnvcomp.so \
-    --exclude libnvcomp_bitcomp.so \
-    --exclude libnvcomp_gdeflate.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 7f6967d7287..fa4c77d67b4 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -58,7 +58,7 @@ dependencies:
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
-- nvcomp==3.0.6
+- nvcomp==4.0.1
 - nvtx>=0.2.1
 - openpyxl
 - packaging
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index c1315e73f16..9b487347a5e 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -56,7 +56,7 @@ dependencies:
 - numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
-- nvcomp==3.0.6
+- nvcomp==4.0.1
 - nvtx>=0.2.1
 - openpyxl
 - packaging
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 4b1c4cca828..dae04c08aca 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -35,7 +35,7 @@ spdlog_version:
   - ">=1.12.0,<1.13"
 
 nvcomp_version:
-  - "=3.0.6"
+  - "=4.0.1"
 
 zlib_version:
   - ">=1.2.13"
diff --git a/dependencies.yaml b/dependencies.yaml
index f8b231efd6d..a3f0ffeec82 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -354,7 +354,7 @@ dependencies:
           - flatbuffers==24.3.25
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
-          - nvcomp==3.0.6
+          - nvcomp==4.0.1
           - spdlog>=1.12.0,<1.13
   rapids_build_skbuild:
     common:
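The nvcomp pin is duplicated across the conda environments, the libcudf recipe, and dependencies.yaml, so a single missed copy can produce a mismatched build. A small consistency check after regenerating an environment (illustrative sketch, assuming a conda environment with the `conda` CLI on PATH; not part of the repo's CI):

    import json
    import subprocess

    # Query the resolved nvcomp package and confirm it matches the 4.0.1 pin.
    pkgs = json.loads(subprocess.check_output(["conda", "list", "--json", "nvcomp"]))
    versions = {p["version"] for p in pkgs if p["name"] == "nvcomp"}
    assert versions == {"4.0.1"}, f"unexpected nvcomp: {versions}"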
diff --git a/java/pom.xml b/java/pom.xml
index 9694e741f16..e4f1cdf64e7 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -1,6 +1,6 @@