diff --git a/.github/labeler.yml b/.github/labeler.yml index 90cdda4d3ca..8506d38a048 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -12,7 +12,7 @@ cudf.polars: - 'python/cudf_polars/**' pylibcudf: - - 'python/cudf/pylibcudf/**' + - 'python/pylibcudf/**' libcudf: - 'cpp/**' diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 08d08c9c5a0..c034752d373 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ade2f35397b..a65cae34653 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,6 +30,7 @@ jobs: - wheel-tests-cudf - wheel-build-cudf-polars - wheel-tests-cudf-polars + - cudf-polars-polars-tests - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -49,6 +50,7 @@ jobs: test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }} test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }} test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }} + test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }} steps: - name: Get PR info id: get-pr-info @@ -81,6 +83,7 @@ jobs: - '!java/**' - '!notebooks/**' - '!python/**' + - '!ci/cudf_pandas_scripts/**' java: - '**' - '!CONTRIBUTING.md' @@ -89,11 +92,13 @@ jobs: - '!img/**' - '!notebooks/**' - '!python/**' + - '!ci/cudf_pandas_scripts/**' notebooks: - '**' - '!CONTRIBUTING.md' - '!README.md' - '!java/**' + - '!ci/cudf_pandas_scripts/**' python: - '**' - '!CONTRIBUTING.md' @@ -102,6 +107,16 @@ jobs: - '!img/**' - '!java/**' - '!notebooks/**' + - '!ci/cudf_pandas_scripts/**' + cudf_pandas: + - '**' + - 'ci/cudf_pandas_scripts/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 @@ -159,7 +174,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -180,7 +195,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -190,7 +205,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks @@ -244,6 +259,17 @@ jobs: # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars script: "ci/test_wheel_cudf_polars.sh" + cudf-polars-polars-tests: + needs: wheel-build-cudf-polars + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
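The `matrix_filter` jq expression on the next line implements this selection: keep amd64 entries, group them by CUDA major version, and take the entry with the newest Python (breaking ties by newest CUDA) from each group. For readers unfamiliar with jq, a minimal Python sketch of the same logic, using hypothetical matrix entries:

```python
# Hypothetical CI matrix entries; field names mirror the jq filter below.
from itertools import groupby

matrix = [
    {"ARCH": "amd64", "CUDA_VER": "11.8.0", "PY_VER": "3.10"},
    {"ARCH": "amd64", "CUDA_VER": "12.5.1", "PY_VER": "3.10"},
    {"ARCH": "amd64", "CUDA_VER": "12.5.1", "PY_VER": "3.11"},
    {"ARCH": "arm64", "CUDA_VER": "12.5.1", "PY_VER": "3.11"},
]

def ver(s):
    # "12.5.1" -> (12, 5, 1), matching jq's split(".") | map(tonumber)
    return tuple(int(p) for p in s.split("."))

amd64 = sorted((e for e in matrix if e["ARCH"] == "amd64"),
               key=lambda e: ver(e["CUDA_VER"])[0])  # groupby needs sorted input
selected = [max(grp, key=lambda e: (ver(e["PY_VER"]), ver(e["CUDA_VER"])))
            for _, grp in groupby(amd64, key=lambda e: ver(e["CUDA_VER"])[0])]
# -> one entry per CUDA major: 11.8.0/py3.10 and 12.5.1/py3.11
```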
+ matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + # This always runs, but only fails if this PR touches code in + # pylibcudf or cudf_polars + script: "ci/test_cudf_polars_polars_tests.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit @@ -277,7 +303,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -288,7 +314,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c06fe929988..a22d3c5b9cc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -81,7 +81,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -93,7 +93,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/build.sh b/build.sh index 211e1db9fbf..56359eae235 100755 --- a/build.sh +++ b/build.sh @@ -17,13 +17,14 @@ ARGS=$* # script, and that this script resides in the repo dir! 
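# (dirname "$0" yields the directory containing this script; the cd + pwd round trip makes it an absolute path)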
REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" -HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] +VALIDARGS="clean libcudf pylibcudf cudf cudf_polars cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" +HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudf_polars] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) libcudf - build the cudf C++ code only pylibcudf - build the pylibcudf Python package cudf - build the cudf Python package + cudf_polars - build the cudf_polars Python package cudfjar - build cudf JAR with static libcudf using devtoolset toolchain dask_cudf - build the dask_cudf Python package benchmarks - build benchmarks @@ -239,11 +240,6 @@ if hasArg --pydevelop; then PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e" fi -# Append `-DFIND_CUDF_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. -if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON" -fi - if hasArg --disable_large_strings; then BUILD_DISABLE_LARGE_STRINGS="ON" fi @@ -358,6 +354,12 @@ if buildAll || hasArg cudf; then python ${PYTHON_ARGS_FOR_INSTALL} . fi +# Build and install the cudf_polars Python package +if buildAll || hasArg cudf_polars; then + + cd ${REPODIR}/python/cudf_polars + python ${PYTHON_ARGS_FOR_INSTALL} . 
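+    # cudf_polars is a pure-Python package (see RAPIDS_PY_WHEEL_PURE="1" in the
+    # wheel test script below), so a plain pip install of the package directory
+    # is all that is required here.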
+fi # Build and install the dask_cudf Python package if buildAll || hasArg dask_cudf; then diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 7a12db927e5..485b2ac8a51 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -67,20 +67,33 @@ def emoji_failed(x): # convert pr_results to a pandas DataFrame and then a markdown table pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() -diff_df = pr_df - main_df -total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call'] -pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1) -pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1) +total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"] +main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1) +main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1) + +total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"] +pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1) +pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1) + +cpu_usage_mean = pr_df["CPU Usage"].mean().round(2) +gpu_usage_mean = pr_df["GPU Usage"].mean().round(2) + +gpu_usage_rate_change = abs(pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()) +pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0) +pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0) +main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0) +main_df["GPU Usage"] = main_df["GPU Usage"].fillna(0) -cpu_usage_mean = pr_df['CPU Usage'].mean().round(2) -gpu_usage_mean = pr_df['GPU Usage'].mean().round(2) +diff_df = pr_df - main_df +diff_df["CPU Usage"] = diff_df["CPU Usage"].round(1).fillna(0) +diff_df["GPU Usage"] = diff_df["GPU Usage"].round(1).fillna(0) -# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns -pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%' -pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%' +# Add '%' suffix to "CPU Usage" and "GPU Usage" columns +pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%" +pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%" -pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']] -diff_df = diff_df[["total", "passed", "failed", "skipped"]] +pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] +diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed) @@ -99,13 +112,36 @@ def emoji_failed(x): "passed_diff": "Passed delta", "failed_diff": "Failed delta", "skipped_diff": "Skipped delta", + "CPU Usage_diff": "CPU Usage delta", + "GPU Usage_diff": "GPU Usage delta", } ) -df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False) - +df = df.sort_values(by=["CPU Usage delta", "Total tests"], ascending=False) +df["CPU Usage delta"] = df["CPU Usage delta"].map(emoji_failed) +df["GPU Usage delta"] = df["GPU Usage delta"].map(emoji_passed) +df = df[ + [ + "Total tests", + "CPU Usage delta", + "GPU Usage delta", + "Passed tests", + "Failed tests", + "Skipped tests", + 
"CPU Usage", + "GPU Usage", + "Total delta", + "Passed delta", + "Failed delta", + "Skipped delta", + ] +] print(comment) print() -print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%") +print( + f"Average GPU usage: {gpu_usage_mean}% {'an increase' if gpu_usage_rate_change > 0 else 'a decrease'} by {gpu_usage_rate_change}%" +) +print() +print(f"Average CPU usage: {cpu_usage_mean}%") print() print("Here are the results of running the Pandas tests against this PR:") print() diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index c6228a4ef33..f6bdc6f9484 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -56,10 +56,10 @@ else echo "" > ./constraints.txt if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]` + # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]` rapids-dependency-file-generator \ --output requirements \ - --file-key test_python \ + --file-key test_python_cudf_pandas \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee ./constraints.txt fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index be55b49870f..870901d223b 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -25,9 +25,9 @@ NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} # Need to distutils-normalize the versions for some use cases -CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") +CURRENT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${CURRENT_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") +PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -45,6 +45,8 @@ sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh DEPENDENCIES=( cudf cudf_kafka + cugraph + cuml custreamz dask-cuda dask-cudf @@ -57,7 +59,7 @@ DEPENDENCIES=( rmm ) for DEP in "${DEPENDENCIES[@]}"; do - for FILE in dependencies.yaml conda/environments/*.yaml; do + for FILE in dependencies.yaml conda/environments/*.yaml python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done for FILE in python/*/pyproject.toml; do @@ -80,6 +82,7 @@ for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh +sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_cudf_polars_polars_tests.sh # Java files NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" diff --git a/ci/run_cudf_polars_polars_tests.sh 
b/ci/run_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..95f78f17f2f --- /dev/null +++ b/ci/run_cudf_polars_polars_tests.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cudf_polars_pytests.sh outside the script directory +# Assumption, polars has been cloned in the root of the repo. +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/ + +DESELECTED_TESTS=( + "tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place + "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode + "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error + "tests/docs/test_user_guide.py" # No dot binary in CI image +) + +DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") +python -m pytest \ + --import-mode=importlib \ + --cache-clear \ + -m "" \ + -p cudf_polars.testing.plugin \ + -v \ + --tb=native \ + ${DESELECTED_TESTS} \ + "$@" \ + py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..55399d0371a --- /dev/null +++ b/ci/test_cudf_polars_polars_tests.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note, the three dots mean we are doing diff between the merge-base +# of upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf" +# TODO: is the target branch exposed anywhere in an environment variable? +if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +then + HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" +else + HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" +fi + +rapids-logger "Download wheels" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist + +# Download the pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep + +rapids-logger "Install pylibcudf" +python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl + +rapids-logger "Install cudf_polars" +python -m pip install $(echo ./dist/cudf_polars*.whl) + +TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') +rapids-logger "Clone polars to ${TAG}" +git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 + +# Install requirements for running polars tests +rapids-logger "Install polars test requirements" +python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt + +function set_exitcode() +{ + EXITCODE=$? 
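+    # $? here is the exit status of the command that tripped the ERR trap.
+    # Stashing it (together with `set +e` below) lets the polars suite run to
+    # completion; the HAS_CHANGES check afterwards decides whether that stashed
+    # status actually fails this CI job.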
+} +EXITCODE=0 +trap set_exitcode ERR +set +e + +rapids-logger "Run polars tests" +./ci/run_cudf_polars_polars_tests.sh + +trap ERR +set -e + +if [ ${EXITCODE} != 0 ]; then + rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}" +else + rapids-logger "Running polars test suite PASSED" +fi + +if [ ${HAS_CHANGES} == 1 ]; then + exit ${EXITCODE} +else + exit 0 +fi diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index d0675b0431a..dc70661a17a 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -10,10 +10,10 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" - +FILE_KEY=$1 rapids-dependency-file-generator \ --output conda \ - --file-key test_python \ + --file-key ${FILE_KEY} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee "${ENV_YAML_DIR}/env.yaml" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index ae34047e87f..2386414b32e 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../; # Common setup steps shared by Python test jobs -source ./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 06a24773cae..67c97ad29a5 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs -source ./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_other rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 28ded2f8e0f..a701bfe15e0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf" pushd python/pylibcudf/pylibcudf/tests python -m pytest \ --cache-clear \ + --numprocesses=8 \ --dist=worksteal \ . 
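Both `--numprocesses=8` (added above) and `--dist=worksteal` (also added to the dask_cudf runs further down) are pytest-xdist options: the test queue is rebalanced at runtime, with idle workers stealing pending tests from busy ones, which helps when per-test runtimes are very uneven. A minimal programmatic sketch of the same invocation (directory path hypothetical):

```python
import pytest  # pytest-xdist must be installed for --numprocesses/--dist

# Run a test directory on 8 workers; with worksteal, idle workers pull queued
# tests from busy ones instead of relying on a fixed up-front partition.
raise SystemExit(pytest.main(["--numprocesses=8", "--dist=worksteal", "tests/"]))
```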
popd diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index da9e50d0a2b..05f882a475b 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -13,10 +13,14 @@ set -eou pipefail if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" else HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" fi +rapids-logger "Download wheels" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist @@ -35,7 +39,7 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then | tee ./constraints.txt fi -# echo to expand wildcard before adding `[extra]` requires for pip +# echo to expand wildcard before adding `[test]` requires for pip python -m pip install \ -v \ --constraint ./constraints.txt \ diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 0d39807d56c..361a42ccda9 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd @@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 62d75965b9f..f91bf1e7046 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -31,7 +31,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -84,7 +84,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index f16f2b377df..f4ec6bd5407 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -82,7 +82,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 33fa4b4eccf..dc75eb4b252 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -26,13 +26,13 @@ librdkafka_version: - ">=2.5.0,<2.6.0a0" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" flatbuffers_version: - "=24.3.25" spdlog_version: - - ">=1.12.0,<1.13" + - ">=1.14.1,<1.15" nvcomp_version: - "=4.0.1" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..136f43ee706 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ 
-378,7 +378,9 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/host_tree_algorithms.cu src/io/json/json_column.cu + src/io/json/column_tree_construction.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu @@ -797,7 +799,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm $ + PUBLIC CCCL::CCCL rmm::rmm $ spdlog::spdlog_header_only PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index abc6f74fccf..4113e38dcf4 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -230,6 +230,11 @@ ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) # -------------------------------------------------------------------------------- ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp) +# ################################################################################################## +# * tdigest benchmark +# -------------------------------------------------------------------------------- +ConfigureNVBench(TDIGEST_NVBENCH quantiles/tdigest.cu) + # ################################################################################################## # * type_dispatcher benchmark --------------------------------------------------------------------- ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu) diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index d4368906702..54d177df401 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -85,7 +85,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr); + cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, stream, mr); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 2d514764fc2..62116ddf661 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -146,11 +147,15 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, cudf::null_equality compare_nulls) { CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + cudf::inner_join(left_selected, + right_selected, + compare_nulls, + cudf::get_default_stream(), + 
cudf::get_current_device_resource_ref()); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/quantiles/tdigest.cu b/cpp/benchmarks/quantiles/tdigest.cu new file mode 100644 index 00000000000..9d37dbc9a26 --- /dev/null +++ b/cpp/benchmarks/quantiles/tdigest.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include + +#include + +void bm_tdigest_merge(nvbench::state& state) +{ + auto const num_tdigests = static_cast(state.get_int64("num_tdigests")); + auto const tdigest_size = static_cast(state.get_int64("tdigest_size")); + auto const tdigests_per_group = + static_cast(state.get_int64("tdigests_per_group")); + auto const max_centroids = static_cast(state.get_int64("max_centroids")); + auto const num_groups = num_tdigests / tdigests_per_group; + auto const total_centroids = num_tdigests * tdigest_size; + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + constexpr int base_value = 5; + + // construct inner means/weights + auto val_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return static_cast(base_value + (i % tdigest_size)); + })); + auto one_iter = thrust::make_constant_iterator(1); + cudf::test::fixed_width_column_wrapper means(val_iter, val_iter + total_centroids); + cudf::test::fixed_width_column_wrapper weights(one_iter, one_iter + total_centroids); + std::vector> inner_struct_children; + inner_struct_children.push_back(means.release()); + inner_struct_children.push_back(weights.release()); + cudf::test::structs_column_wrapper inner_struct(std::move(inner_struct_children)); + + // construct the tdigest lists themselves + auto offset_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return i * tdigest_size; + })); + cudf::test::fixed_width_column_wrapper offsets(offset_iter, offset_iter + num_tdigests + 1); + auto list_col = cudf::make_lists_column( + num_tdigests, offsets.release(), inner_struct.release(), 0, {}, stream, mr); + + // min and max columns + auto min_iter = thrust::make_constant_iterator(base_value); + auto max_iter = thrust::make_constant_iterator(base_value + (tdigest_size - 1)); + cudf::test::fixed_width_column_wrapper mins(min_iter, min_iter + num_tdigests); + cudf::test::fixed_width_column_wrapper maxes(max_iter, max_iter + num_tdigests); + + // assemble the whole thing + std::vector> tdigest_children; + tdigest_children.push_back(std::move(list_col)); + tdigest_children.push_back(mins.release()); + tdigest_children.push_back(maxes.release()); + cudf::test::structs_column_wrapper tdigest(std::move(tdigest_children)); + + rmm::device_uvector group_offsets(num_groups + 1, stream, mr); + rmm::device_uvector 
group_labels(num_tdigests, stream, mr); + auto group_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i * tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_offset_iter, + group_offset_iter + num_groups + 1, + group_offsets.begin()); + auto group_label_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i / tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_label_iter, + group_label_iter + num_tdigests, + group_labels.begin()); + + state.add_element_count(total_centroids); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, + [&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto result = cudf::tdigest::detail::group_merge_tdigest( + tdigest, group_offsets, group_labels, num_groups, max_centroids, stream, mr); + timer.stop(); + }); +} + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many tiny groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {1}) + .add_int64_axis("max_centroids", {10000, 1000}); + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many small groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {3}) + .add_int64_axis("max_centroids", {10000, 1000}); diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index c0e07d02d94..90b0f4d8a8e 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,21 +16,12 @@ function(find_and_configure_spdlog) include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET cudf-exports) - rapids_export_package(BUILD spdlog cudf-exports) + rapids_cpm_spdlog( + FMT_OPTION "EXTERNAL_FMT_HO" + INSTALL_EXPORT_SET cudf-exports + BUILD_EXPORT_SET cudf-exports + ) - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog:: - ) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) - endif() endfunction() find_and_configure_spdlog() diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..9cda22d0695 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -18,6 +18,8 @@ #include "../utilities/timer.hpp" +#include + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -159,8 +161,11 @@ int main(int argc, char const** argv) // Left anti-join the original and transcoded tables // identical tables should not throw an exception and // return an empty indices vector - auto const indices = cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); + auto const indices = cudf::left_anti_join(input->view(), + transcoded_input->view(), + cudf::null_equality::EQUAL, + cudf::get_default_stream(), + resource.get()); // No exception thrown, check indices auto const valid = indices->size() == 0; diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index c7523c80b2b..7359a0d5fde 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -17,9 +17,12 @@ #pragma once #include +#include #include #include +#include + #include /** @@ -40,6 +43,7 @@ namespace datetime { * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t years @@ -47,6 +51,7 @@ namespace datetime { */ std::unique_ptr extract_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -54,6 +59,7 @@ std::unique_ptr extract_year( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t months @@ -61,6 +67,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -68,6 +75,7 @@ std::unique_ptr extract_month( * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -75,6 +83,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -82,6 +91,7 @@ std::unique_ptr extract_day( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -89,6 +99,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -96,6 +107,7 @@ std::unique_ptr extract_weekday( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t hours @@ -103,6 +115,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -110,6 +123,7 @@ std::unique_ptr extract_hour( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t minutes @@ -117,6 +131,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -124,6 +139,7 @@ std::unique_ptr extract_minute( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t seconds @@ -131,6 +147,7 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -141,6 +158,7 @@ std::unique_ptr extract_second( * For example, the millisecond fraction of 1.234567890 seconds is 234. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t milliseconds @@ -148,6 +166,7 @@ std::unique_ptr extract_second( */ std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -158,6 +177,7 @@ std::unique_ptr extract_millisecond_fraction( * For example, the microsecond fraction of 1.234567890 seconds is 567. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t microseconds @@ -165,6 +185,7 @@ std::unique_ptr extract_millisecond_fraction( */ std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -175,6 +196,7 @@ std::unique_ptr extract_microsecond_fraction( * For example, the nanosecond fraction of 1.234567890 seconds is 890. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t nanoseconds @@ -182,6 +204,7 @@ std::unique_ptr extract_microsecond_fraction( */ std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group @@ -196,6 +219,7 @@ std::unique_ptr extract_nanosecond_fraction( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS @@ -203,6 +227,7 @@ std::unique_ptr extract_nanosecond_fraction( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -210,6 +235,7 @@ std::unique_ptr last_day_of_month( * returns an int16_t cudf::column. 
The value is between [1, {365-366}] * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype INT16 containing the day number since the start of the year @@ -217,6 +243,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -245,6 +272,7 @@ std::unique_ptr day_of_year( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::column_view of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of timestamp type containing the computed timestamps @@ -252,6 +280,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -280,6 +309,7 @@ std::unique_ptr add_calendrical_months( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::scalar of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @return cudf::column of timestamp type containing the computed timestamps @@ -287,6 +317,7 @@ std::unique_ptr add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -297,6 +328,7 @@ std::unique_ptr add_calendrical_months( * `output[i] is null` if `column[i]` is null * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype BOOL8 truth value of the corresponding date @@ -304,6 +336,7 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -315,11 +348,13 @@ std::unique_ptr is_leap_year( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return cudf::column of datatype INT16 of days in month of the corresponding date */ std::unique_ptr days_in_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -331,11 +366,13 @@ std::unique_ptr days_in_month( * 
@throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column The input column containing datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return A column of INT16 type indicating which quarter the date is in */ std::unique_ptr extract_quarter( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -357,6 +394,7 @@ enum class rounding_frequency : int32_t { * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round up to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -365,6 +403,7 @@ enum class rounding_frequency : int32_t { std::unique_ptr ceil_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -372,6 +411,7 @@ std::unique_ptr ceil_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round down to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -380,6 +420,7 @@ std::unique_ptr ceil_datetimes( std::unique_ptr floor_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -387,6 +428,7 @@ std::unique_ptr floor_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -395,6 +437,7 @@ std::unique_ptr floor_datetimes( std::unique_ptr round_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 31782cbaf8a..9db7e48498f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -26,111 +26,108 @@ namespace CUDF_EXPORT cudf { namespace datetime { namespace detail { /** - * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_year(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_day(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_day(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_weekday(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_hour(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_minute(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_second(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_second(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr last_day_of_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr day_of_year(cudf::column_view const& column, rmm::cuda_stream_view stream, @@ -138,9 +135,8 @@ std::unique_ptr day_of_year(cudf::column_view const& column, /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::column_view const& months, @@ -149,9 +145,8 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::scalar const& months, @@ -159,9 +154,9 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti rmm::device_async_resource_ref mr); /** - * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_leap_year(cudf::column_view const& column, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 5738f9ec8e9..f51d1ba42b2 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -26,14 +27,13 @@ namespace detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace detail diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cc8912cb022..a590eb27511 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -97,6 +97,7 @@ class distinct_hash_join; * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -108,6 +109,7 @@ std::pair>, inner_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -137,6 +139,7 @@ inner_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -148,6 +151,7 @@ std::pair>, left_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -176,6 +180,7 @@ left_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
@@ -187,6 +192,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
full_join(cudf::table_view const& left_keys,
cudf::table_view const& right_keys,
null_equality compare_nulls = null_equality::EQUAL,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -205,6 +211,7 @@ full_join(cudf::table_view const& left_keys,
* @param left_keys The left table
* @param right_keys The right table
* @param compare_nulls Controls whether null join-key values should match or not
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A vector `left_indices` that can be used to construct
@@ -215,6 +222,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
cudf::table_view const& left_keys,
cudf::table_view const& right_keys,
null_equality compare_nulls = null_equality::EQUAL,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -236,6 +244,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
* @param[in] right_keys The right table
* @param[in] compare_nulls controls whether null join-key values
* should match or not.
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A column `left_indices` that can be used to construct
@@ -246,6 +255,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
cudf::table_view const& left_keys,
cudf::table_view const& right_keys,
null_equality compare_nulls = null_equality::EQUAL,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -266,6 +276,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
*
* @param left The left table
* @param right The right table
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
*
* @return Result of cross joining `left` and `right` tables
@@ -273,6 +284,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
std::unique_ptr<cudf::table> cross_join(
cudf::table_view const& left,
cudf::table_view const& right,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -567,6 +579,7 @@ class distinct_hash_join {
* @param right The right table
* @param binary_predicate The condition on which to join
* @param output_size Optional value which allows users to specify the exact output size
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
@@ -578,6 +591,7 @@ conditional_inner_join(table_view const& left,
table_view const& right,
ast::expression const& binary_predicate,
std::optional<std::size_t> output_size = {},
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
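A minimal usage sketch of the stream-aware overloads this header change introduces (illustrative only, not part of the patch; `left`, `right`, and `stream` are placeholder names, and the AST predicate simply compares column 0 of the two tables):

  #include <cudf/ast/expressions.hpp>
  #include <cudf/join.hpp>
  #include <cudf/table/table_view.hpp>
  #include <rmm/cuda_stream_view.hpp>

  void join_examples(cudf::table_view left, cudf::table_view right, rmm::cuda_stream_view stream)
  {
    // Equality join on the key columns; device work is now ordered on the
    // caller's stream instead of cudf::get_default_stream().
    auto [inner_left, inner_right] =
      cudf::inner_join(left, right, cudf::null_equality::EQUAL, stream);

    // Conditional join on an AST predicate; pre-computing the output size via
    // the matching *_size API skips the sizing pass inside the join itself.
    namespace ast = cudf::ast;
    auto left_col  = ast::column_reference{0, ast::table_reference::LEFT};
    auto right_col = ast::column_reference{0, ast::table_reference::RIGHT};
    auto predicate = ast::operation{ast::ast_operator::EQUAL, left_col, right_col};

    std::size_t const size = cudf::conditional_inner_join_size(left, right, predicate, stream);
    auto [cond_left, cond_right] =
      cudf::conditional_inner_join(left, right, predicate, size, stream);
  }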
@@ -612,6 +626,7 @@ conditional_inner_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -623,6 +638,7 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -655,6 +671,7 @@ conditional_left_join(table_view const& left, * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -665,6 +682,7 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -693,6 +711,7 @@ conditional_full_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -704,6 +723,7 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -732,6 +752,7 @@ std::unique_ptr> conditional_left_semi_join( * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -743,6 +764,7 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -786,6 +808,7 @@ std::unique_ptr> conditional_left_anti_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_inner_join_size API). 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -801,6 +824,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -846,6 +870,7 @@ mixed_inner_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_left_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -861,6 +886,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -906,6 +932,7 @@ mixed_left_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_full_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -921,6 +948,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -956,6 +984,7 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -968,6 +997,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1004,6 +1034,7 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * 
@param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1016,6 +1047,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1041,6 +1073,7 @@ std::unique_ptr> mixed_left_anti_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1056,6 +1089,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1081,6 +1115,7 @@ std::pair>> mixed_in * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1096,6 +1131,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1111,6 +1147,7 @@ std::pair>> mixed_le * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1119,6 +1156,7 @@ std::size_t conditional_inner_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1134,6 +1172,7 @@ std::size_t conditional_inner_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1142,6 +1181,7 @@ std::size_t conditional_left_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1157,6 +1197,7 @@ std::size_t conditional_left_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1165,6 +1206,7 @@ std::size_t conditional_left_semi_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1180,6 +1222,7 @@ std::size_t conditional_left_semi_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1188,6 +1231,7 @@ std::size_t conditional_left_anti_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index aa903770e26..f6de1056c24 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,9 +15,12 @@ */ #pragma once +#include #include #include +#include + #include #include #include @@ -43,6 +46,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory. * * @return The transition table for the given timezone @@ -50,6 +54,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; std::unique_ptr
make_timezone_transition_table(
std::optional<std::string_view> tzif_dir,
std::string_view timezone_name,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
} // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu
index fd9a6b8f5fe..ddb0dbcd96d 100644
--- a/cpp/src/datetime/datetime_ops.cu
+++ b/cpp/src/datetime/datetime_ops.cu
@@ -580,142 +580,167 @@ std::unique_ptr<column> extract_quarter(column_view const& column,
std::unique_ptr<column> ceil_datetimes(column_view const& column,
rounding_frequency freq,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::round_general(
- detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr);
+ return detail::round_general(detail::rounding_function::CEIL, freq, column, stream, mr);
}

std::unique_ptr<column> floor_datetimes(column_view const& column,
rounding_frequency freq,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::round_general(
- detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr);
+ return detail::round_general(detail::rounding_function::FLOOR, freq, column, stream, mr);
}

std::unique_ptr<column> round_datetimes(column_view const& column,
rounding_frequency freq,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::round_general(
- detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr);
+ return detail::round_general(detail::rounding_function::ROUND, freq, column, stream, mr);
}

-std::unique_ptr<column> extract_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_year(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_year(column, cudf::get_default_stream(), mr);
+ return detail::extract_year(column, stream, mr);
}

-std::unique_ptr<column> extract_month(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_month(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_month(column, cudf::get_default_stream(), mr);
+ return detail::extract_month(column, stream, mr);
}

-std::unique_ptr<column> extract_day(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_day(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_day(column, cudf::get_default_stream(), mr);
+ return detail::extract_day(column, stream, mr);
}

std::unique_ptr<column> extract_weekday(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_weekday(column, cudf::get_default_stream(), mr);
+ return detail::extract_weekday(column, stream, mr);
}

-std::unique_ptr<column> extract_hour(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_hour(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_hour(column, cudf::get_default_stream(), mr);
+ return detail::extract_hour(column, stream, mr);
}

-std::unique_ptr<column> extract_minute(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_minute(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_minute(column, cudf::get_default_stream(), mr);
+ return detail::extract_minute(column, stream, mr);
}

-std::unique_ptr<column> extract_second(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_second(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_second(column, cudf::get_default_stream(), mr);
+ return detail::extract_second(column, stream, mr);
}

std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr);
+ return detail::extract_millisecond_fraction(column, stream, mr);
}

std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr);
+ return detail::extract_microsecond_fraction(column, stream, mr);
}

std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr);
+ return detail::extract_nanosecond_fraction(column, stream, mr);
}

std::unique_ptr<column> last_day_of_month(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::last_day_of_month(column, cudf::get_default_stream(), mr);
+ return detail::last_day_of_month(column, stream, mr);
}

-std::unique_ptr<column> day_of_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> day_of_year(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::day_of_year(column, cudf::get_default_stream(), mr);
+ return detail::day_of_year(column, stream, mr);
}

std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
cudf::column_view const& months_column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::add_calendrical_months(
- timestamp_column, months_column, cudf::get_default_stream(), mr);
+ return detail::add_calendrical_months(timestamp_column, months_column, stream, mr);
}

std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
cudf::scalar const& months,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr);
+ return detail::add_calendrical_months(timestamp_column, months, stream, mr);
}

-std::unique_ptr<column> is_leap_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> is_leap_year(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::is_leap_year(column, cudf::get_default_stream(), mr);
+ return detail::is_leap_year(column, stream, mr);
}

-std::unique_ptr<column> days_in_month(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> days_in_month(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::days_in_month(column, cudf::get_default_stream(), mr);
+ return detail::days_in_month(column, stream, mr);
}

std::unique_ptr<column> extract_quarter(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_quarter(column, cudf::get_default_stream(), mr);
+ return detail::extract_quarter(column, stream, mr);
}

} // namespace datetime
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 6498a5e6c55..cf239297255 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -380,11 +380,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year)
std::unique_ptr<table>
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::make_timezone_transition_table( - tzif_dir, timezone_name, cudf::get_default_stream(), mr); + return detail::make_timezone_transition_table(tzif_dir, timezone_name, stream, mr); } namespace detail { diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu new file mode 100644 index 00000000000..c4fe7926706 --- /dev/null +++ b/cpp/src/io/json/column_tree_construction.cu @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::io::json { + +using row_offset_t = size_type; + +#ifdef CSR_DEBUG_PRINT +template +void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) +{ + stream.synchronize(); + auto h_vec = cudf::detail::make_std_vector_sync(d_vec, stream); + std::cout << name << " = "; + for (auto e : h_vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} +#endif + +namespace experimental::detail { + +struct level_ordering { + device_span node_levels; + device_span col_ids; + device_span parent_node_ids; + __device__ bool operator()(NodeIndexT lhs_node_id, NodeIndexT rhs_node_id) const + { + auto lhs_parent_col_id = parent_node_ids[lhs_node_id] == parent_node_sentinel + ? parent_node_sentinel + : col_ids[parent_node_ids[lhs_node_id]]; + auto rhs_parent_col_id = parent_node_ids[rhs_node_id] == parent_node_sentinel + ? parent_node_sentinel + : col_ids[parent_node_ids[rhs_node_id]]; + + return (node_levels[lhs_node_id] < node_levels[rhs_node_id]) || + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && + lhs_parent_col_id < rhs_parent_col_id) || + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && + lhs_parent_col_id == rhs_parent_col_id && col_ids[lhs_node_id] < col_ids[rhs_node_id]); + } +}; + +struct parent_nodeids_to_colids { + device_span rev_mapped_col_ids; + __device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT + { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel + : rev_mapped_col_ids[parent_node_id]; + } +}; + +/** + * @brief Reduces node tree representation to column tree CSR representation. 
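+ * For intuition (illustration only): for input rows like {"a": {"b": 1}}, the
+ * column tree is root -> "a" -> "b", and the CSR pair built here stores, per
+ * column, its parent id together with its child ids as the adjacency list.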
+ * + * @param node_tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns + */ +std::tuple reduce_to_column_tree( + tree_meta_t& node_tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT row_array_parent_col_id, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + if (original_col_ids.empty()) { + rmm::device_uvector empty_row_idx(0, stream); + rmm::device_uvector empty_col_idx(0, stream); + rmm::device_uvector empty_column_categories(0, stream); + rmm::device_uvector empty_max_row_offsets(0, stream); + rmm::device_uvector empty_mapped_col_ids(0, stream); + return std::tuple{compressed_sparse_row{std::move(empty_row_idx), std::move(empty_col_idx)}, + column_tree_properties{std::move(empty_column_categories), + std::move(empty_max_row_offsets), + std::move(empty_mapped_col_ids)}}; + } + + auto [unpermuted_tree, unpermuted_col_ids, unpermuted_max_row_offsets] = + cudf::io::json::detail::reduce_to_column_tree(node_tree, + original_col_ids, + sorted_col_ids, + ordered_node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + NodeIndexT num_columns = unpermuted_col_ids.size(); + + auto mapped_col_ids = cudf::detail::make_device_uvector_async( + unpermuted_col_ids, stream, cudf::get_current_device_resource_ref()); + rmm::device_uvector rev_mapped_col_ids(num_columns, stream); + rmm::device_uvector reordering_index(unpermuted_col_ids.size(), stream); + + thrust::sequence( + rmm::exec_policy_nosync(stream), reordering_index.begin(), reordering_index.end()); + // Reorder nodes and column ids in level-wise fashion + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), + reordering_index.begin(), + reordering_index.end(), + mapped_col_ids.begin(), + level_ordering{ + unpermuted_tree.node_levels, unpermuted_col_ids, unpermuted_tree.parent_node_ids}); + + { + auto mapped_col_ids_copy = cudf::detail::make_device_uvector_async( + mapped_col_ids, stream, cudf::get_current_device_resource_ref()); + thrust::sequence( + rmm::exec_policy_nosync(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(stream), + mapped_col_ids_copy.begin(), + mapped_col_ids_copy.end(), + rev_mapped_col_ids.begin()); + } + + rmm::device_uvector parent_col_ids(num_columns, stream); + thrust::transform_output_iterator parent_col_ids_it(parent_col_ids.begin(), + parent_nodeids_to_colids{rev_mapped_col_ids}); + rmm::device_uvector max_row_offsets(num_columns, stream); + rmm::device_uvector column_categories(num_columns, stream); + thrust::copy_n( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_permutation_iterator( + unpermuted_tree.parent_node_ids.begin(), reordering_index.begin()), + thrust::make_permutation_iterator(unpermuted_max_row_offsets.begin(), + reordering_index.begin()), + thrust::make_permutation_iterator( + unpermuted_tree.node_categories.begin(), reordering_index.begin())), + num_columns, + thrust::make_zip_iterator( + parent_col_ids_it, 
max_row_offsets.begin(), column_categories.begin())); + +#ifdef CSR_DEBUG_PRINT + print(reordering_index, "h_reordering_index", stream); + print(mapped_col_ids, "h_mapped_col_ids", stream); + print(rev_mapped_col_ids, "h_rev_mapped_col_ids", stream); + print(parent_col_ids, "h_parent_col_ids", stream); + print(max_row_offsets, "h_max_row_offsets", stream); +#endif + + auto construct_row_idx = [&stream](NodeIndexT num_columns, + device_span parent_col_ids) { + auto row_idx = cudf::detail::make_zeroed_device_uvector_async( + static_cast(num_columns + 1), stream, cudf::get_current_device_resource_ref()); + // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) + // children adjacency + + auto num_non_leaf_columns = thrust::unique_count( + rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end()); + rmm::device_uvector non_leaf_nodes(num_non_leaf_columns, stream); + rmm::device_uvector non_leaf_nodes_children(num_non_leaf_columns, stream); + thrust::reduce_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + thrust::make_constant_iterator(1), + non_leaf_nodes.begin(), + non_leaf_nodes_children.begin(), + thrust::equal_to()); + + thrust::scatter(rmm::exec_policy_nosync(stream), + non_leaf_nodes_children.begin(), + non_leaf_nodes_children.end(), + non_leaf_nodes.begin(), + row_idx.begin() + 1); + + if (num_columns > 1) { + thrust::transform_inclusive_scan( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(1), row_idx.begin() + 1), + thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, row_idx.end()), + row_idx.begin() + 1, + cuda::proclaim_return_type([] __device__(auto a) { + auto n = thrust::get<0>(a); + auto idx = thrust::get<1>(a); + return n == 1 ? idx : idx + 1; + }), + thrust::plus{}); + } else { + auto single_node = 1; + row_idx.set_element_async(1, single_node, stream); + } + +#ifdef CSR_DEBUG_PRINT + print(row_idx, "h_row_idx", stream); +#endif + return row_idx; + }; + + auto construct_col_idx = [&stream](NodeIndexT num_columns, + device_span parent_col_ids, + device_span row_idx) { + rmm::device_uvector col_idx((num_columns - 1) * 2, stream); + thrust::fill(rmm::exec_policy_nosync(stream), col_idx.begin(), col_idx.end(), -1); + // excluding root node, construct scatter map + rmm::device_uvector map(num_columns - 1, stream); + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + thrust::make_constant_iterator(1), + map.begin()); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(1), + num_columns - 1, + [row_idx = row_idx.begin(), + map = map.begin(), + parent_col_ids = parent_col_ids.begin()] __device__(auto i) { + auto parent_col_id = parent_col_ids[i]; + if (parent_col_id == 0) + --map[i - 1]; + else + map[i - 1] += row_idx[parent_col_id]; + }); + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(1) + num_columns - 1, + map.begin(), + col_idx.begin()); + + // Skip the parent of root node + thrust::scatter(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + row_idx.begin() + 1, + col_idx.begin()); + +#ifdef CSR_DEBUG_PRINT + print(col_idx, "h_col_idx", stream); +#endif + + return col_idx; + }; + + /* + 5. CSR construction: + a. Sort column levels and get their ordering + b. 
For each column node coln iterated according to sorted_column_levels; do + i. Find nodes that have coln as the parent node -> set adj_coln + ii. row idx[coln] = size of adj_coln + 1 + iii. col idx[coln] = adj_coln U {parent_col_id[coln]} + */ + auto row_idx = construct_row_idx(num_columns, parent_col_ids); + auto col_idx = construct_col_idx(num_columns, parent_col_ids, row_idx); + + return std::tuple{ + compressed_sparse_row{std::move(row_idx), std::move(col_idx)}, + column_tree_properties{ + std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; +} + +} // namespace experimental::detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu new file mode 100644 index 00000000000..70d61132b42 --- /dev/null +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::json::detail { + +/** + * @brief Get the column indices for the values column for array of arrays rows + * + * @param row_array_children_level The level of the row array's children + * @param d_tree The tree metadata + * @param col_ids The column ids + * @param num_columns The number of columns + * @param stream The stream to use + * @return The value columns' indices + */ +rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, + tree_meta_t const& d_tree, + device_span col_ids, + size_type const num_columns, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [level2_nodes, level2_indices] = get_array_children_indices( + row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); + auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); + rmm::device_uvector values_column_indices(num_columns, stream); + thrust::scatter(rmm::exec_policy(stream), + level2_indices.begin(), + level2_indices.end(), + col_id_location, + values_column_indices.begin()); + return values_column_indices; +} + +/** + * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
+ *
+ * @param input String device buffer
+ * @param node_range_begin Begin offset of the strings
+ * @param node_range_end End offset of the strings
+ * @param stream CUDA stream
+ * @return Vector of strings
+ */
+std::vector<std::string> copy_strings_to_host_sync(
+ device_span<char const> input,
+ device_span<SymbolOffsetT const> node_range_begin,
+ device_span<SymbolOffsetT const> node_range_end,
+ rmm::cuda_stream_view stream)
+{
+ CUDF_FUNC_RANGE();
+ auto const num_strings = node_range_begin.size();
+ rmm::device_uvector<size_type> string_offsets(num_strings, stream);
+ rmm::device_uvector<size_type> string_lengths(num_strings, stream);
+ auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
+ thrust::transform(rmm::exec_policy(stream),
+ d_offset_pairs,
+ d_offset_pairs + num_strings,
+ thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()),
+ [] __device__(auto const& offsets) {
+ // Note: first character for non-field columns
+ return thrust::make_tuple(
+ static_cast<size_type>(thrust::get<0>(offsets)),
+ static_cast<size_type>(thrust::get<1>(offsets) - thrust::get<0>(offsets)));
+ });
+
+ cudf::io::parse_options_view options_view{};
+ options_view.quotechar = '\0'; // no quotes
+ options_view.keepquotes = true;
+ auto d_offset_length_it =
+ thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin());
+ auto d_column_names = parse_data(input.data(),
+ d_offset_length_it,
+ num_strings,
+ data_type{type_id::STRING},
+ rmm::device_buffer{},
+ 0,
+ options_view,
+ stream,
+ cudf::get_current_device_resource_ref());
+ auto to_host = [stream](auto const& col) {
+ if (col.is_empty()) return std::vector<std::string>{};
+ auto const scv = cudf::strings_column_view(col);
+ auto const h_chars = cudf::detail::make_host_vector_async(
+ cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
+ auto const h_offsets = cudf::detail::make_host_vector_async(
+ cudf::device_span<size_type const>(scv.offsets().data<size_type>() + scv.offset(),
+ scv.size() + 1),
+ stream);
+ stream.synchronize();
+
+ // build std::string vector from chars and offsets
+ std::vector<std::string> host_data;
+ host_data.reserve(col.size());
+ std::transform(
+ std::begin(h_offsets),
+ std::end(h_offsets) - 1,
+ std::begin(h_offsets) + 1,
+ std::back_inserter(host_data),
+ [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
+ return host_data;
+ };
+ return to_host(d_column_names->view());
+}
+
+/**
+ * @brief Checks whether each string column in the tree contains only null literals.
+ * Entries for non-string columns are set to true; if any row of a string column is not a null
+ * literal, that column's entry is set to false.
+ *
+ * @param input Input JSON string device data
+ * @param d_column_tree column tree representation of JSON string
+ * @param tree Node tree representation of the JSON string
+ * @param col_ids Column ids of the nodes in the tree
+ * @param options Parsing options specifying the parsing behaviour
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Array of bytes where each byte indicates whether the corresponding string column
+ * contains only nulls
+ */ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; +} +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream); + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.begin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? 
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + // destroy references of all child columns after this step, by calling remove_child_columns + }; + + path_from_tree 
tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. + auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. 
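+ // Collision rules implemented below: a scalar (VAL/STR) node arriving at a name
+ // that already holds a nested column is ignored; a nested node arriving at a name
+ // that holds a scalar replaces it; with mixed-types-as-string enabled, a genuine
+ // mix (e.g. {"a": 1} in one row, {"a": {"b": 2}} in another) instead coerces
+ // column "a" to a string column and drops its children; mixing lists and structs
+ // under one name is rejected with an error.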
+ for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } + continue; + } + + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + bool const is_mixed_type = [&]() { + // If new or old is STR and they are all not null, make it mixed type, else ignore. + if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); + // all its children (which are already inserted) are ignored later. 
+ } + col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + } + + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + ignore_vals[this_col_id] = 1; + continue; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + + auto this_column_category = column_categories[this_col_id]; + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; + } + + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
+ forced_as_string_column[this_col_id] = true;
+ ignore_vals[this_col_id] = 1;
+ }
+ // Convert only the forced-as-string columns themselves to strings (so they get
+ // copied), but not their children
+ if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and
+ forced_as_string_column[this_col_id])
+ column_categories[this_col_id] = NC_STR;
+ }
+ cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
+ column_categories.data(),
+ column_categories.size() * sizeof(column_categories[0]),
+ cudf::detail::host_memory_kind::PAGEABLE,
+ stream);
+
+ // restore unique_col_ids order
+ std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
+ return thrust::get<1>(a) < thrust::get<1>(b);
+ });
+ return {ignore_vals, columns};
+}
+
+void scatter_offsets(
+ tree_meta_t& tree,
+ device_span col_ids,
+ device_span row_offsets,
+ device_span node_ids,
+ device_span sorted_col_ids, // Reuse this for parent_col_ids
+ tree_meta_t& d_column_tree,
+ host_span ignore_vals,
+ std::unordered_map>& columns,
+ rmm::cuda_stream_view stream)
+{
+ auto const num_nodes = col_ids.size();
+ auto const num_columns = d_column_tree.node_categories.size();
+ // move column data pointers to the device.
+ auto columns_data = cudf::detail::make_host_vector(num_columns, stream);
+ for (auto& [col_id, col_ref] : columns) {
+ if (col_id == parent_node_sentinel) continue;
+ auto& col = col_ref.get();
+ columns_data[col_id] = json_column_data{col.string_offsets.data(),
+ col.string_lengths.data(),
+ col.child_offsets.data(),
+ static_cast(col.validity.data())};
+ }
+
+ auto d_ignore_vals = cudf::detail::make_device_uvector_async(
+ ignore_vals, stream, cudf::get_current_device_resource_ref());
+ auto d_columns_data = cudf::detail::make_device_uvector_async(
+ columns_data, stream, cudf::get_current_device_resource_ref());
+
+ // 3. scatter string offsets to respective columns, set validity bits
+ thrust::for_each_n(
+ rmm::exec_policy(stream),
+ thrust::counting_iterator(0),
+ num_nodes,
+ [column_categories = d_column_tree.node_categories.begin(),
+ col_ids = col_ids.begin(),
+ row_offsets = row_offsets.begin(),
+ range_begin = tree.node_range_begin.begin(),
+ range_end = tree.node_range_end.begin(),
+ d_ignore_vals = d_ignore_vals.begin(),
+ d_columns_data = d_columns_data.begin()] __device__(size_type i) {
+ if (d_ignore_vals[col_ids[i]]) return;
+ auto const node_category = column_categories[col_ids[i]];
+ switch (node_category) {
+ case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
+ case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
+ case NC_STR: [[fallthrough]];
+ case NC_VAL:
+ if (d_ignore_vals[col_ids[i]]) break;
+ set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]);
+ d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i];
+ d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i];
+ break;
+ default: break;
+ }
+ });
+
+ // 4. scatter List offset
+ // copy_if only nodes whose parent is a list: (node_id, parent_col_id)
+ // stable_sort by parent_col_id of {node_id}.
+ // For all unique parent_node_id of (i==0, i-1!=i), write start offset.
+ // (i==last, i+1!=i), write end offset.
+ // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
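+ // For string and list columns the scattered offsets are sparse: null or empty
+ // rows are never written and keep their zero-initialized values. The inclusive
+ // scan with thrust::maximum in step 5 below fills those gaps, e.g. gathered
+ // offsets {0, 5, 0, 9} become {0, 5, 5, 9}, so each row inherits the last
+ // offset written before it.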
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 756047d383a..dfd9285f682 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -36,26 +35,18 @@ #include #include -#include #include #include #include -#include #include #include #include #include -#include -#include #include #include -#include -#include - namespace cudf::io::json::detail { -// DEBUG prints auto to_cat = [](auto v) -> std::string { switch (v) { case NC_STRUCT: return " S"; @@ -114,18 +105,19 @@ void print_tree(host_span input, */ std::tuple, rmm::device_uvector> reduce_to_column_tree(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); + // 1. column count for allocation - auto const num_columns = - thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); + auto const num_columns = thrust::unique_count( + rmm::exec_policy_nosync(stream), sorted_col_ids.begin(), sorted_col_ids.end()); // 2. reduce_by_key {col_id}, {row_offset}, max. rmm::device_uvector unique_col_ids(num_columns, stream); @@ -170,30 +162,34 @@ reduce_to_column_tree(tree_meta_t& tree, }); // 4. 
unique_copy parent_node_ids, ranges - rmm::device_uvector column_levels(0, stream); // not required + rmm::device_uvector column_levels(num_columns, stream); // not required rmm::device_uvector parent_col_ids(num_columns, stream); rmm::device_uvector col_range_begin(num_columns, stream); // Field names rmm::device_uvector col_range_end(num_columns, stream); rmm::device_uvector unique_node_ids(num_columns, stream); - thrust::unique_by_key_copy(rmm::exec_policy(stream), + thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream), sorted_col_ids.begin(), sorted_col_ids.end(), ordered_node_ids.begin(), thrust::make_discard_iterator(), unique_node_ids.begin()); + thrust::copy_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_zip_iterator( + thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())), unique_node_ids.size(), - thrust::make_zip_iterator( - parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin())); + thrust::make_zip_iterator(column_levels.begin(), + parent_col_ids.begin(), + col_range_begin.begin(), + col_range_end.begin())); // convert parent_node_ids to parent_col_ids thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.end(), parent_col_ids.begin(), @@ -211,18 +207,17 @@ reduce_to_column_tree(tree_meta_t& tree, column_categories[parent_col_id] == NC_LIST && (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); }; + // Mixed types in List children go to different columns, // so all immediate children of list column should have same max_row_offsets. // create list's children max_row_offsets array. (initialize to zero) // atomicMax on children max_row_offsets array. // gather the max_row_offsets from children row offset array. { - rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); - thrust::fill(rmm::exec_policy(stream), - list_parents_children_max_row_offsets.begin(), - list_parents_children_max_row_offsets.end(), - 0); - thrust::for_each(rmm::exec_policy(stream), + auto list_parents_children_max_row_offsets = + cudf::detail::make_zeroed_device_uvector_async( + static_cast(num_columns), stream, cudf::get_current_device_resource_ref()); + thrust::for_each(rmm::exec_policy_nosync(stream), unique_col_ids.begin(), unique_col_ids.end(), [column_categories = column_categories.begin(), @@ -238,8 +233,9 @@ reduce_to_column_tree(tree_meta_t& tree, ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed); } }); + thrust::gather_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.end(), parent_col_ids.begin(), @@ -254,7 +250,7 @@ reduce_to_column_tree(tree_meta_t& tree, // copy lists' max_row_offsets to children. // all structs should have same size. 
thrust::transform_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), unique_col_ids.begin(), unique_col_ids.end(), max_row_offsets.begin(), @@ -280,7 +276,7 @@ reduce_to_column_tree(tree_meta_t& tree, // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) thrust::transform_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), col_range_begin.begin(), col_range_begin.end(), column_categories.begin(), @@ -297,678 +293,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Get the column indices for the values column for array of arrays rows - * - * @param row_array_children_level The level of the row array's children - * @param d_tree The tree metadata - * @param col_ids The column ids - * @param num_columns The number of columns - * @param stream The stream to use - * @return The value columns' indices - */ -rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, - tree_meta_t const& d_tree, - device_span col_ids, - size_type const num_columns, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto [level2_nodes, level2_indices] = get_array_children_indices( - row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); - auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); - rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), - level2_indices.begin(), - level2_indices.end(), - col_id_location, - values_column_indices.begin()); - return values_column_indices; -} - -/** - * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. - * - * @param input String device buffer - * @param node_range_begin Begin offset of the strings - * @param node_range_end End offset of the strings - * @param stream CUDA stream - * @return Vector of strings - */ -std::vector copy_strings_to_host_sync( - device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto const num_strings = node_range_begin.size(); - rmm::device_uvector string_offsets(num_strings, stream); - rmm::device_uvector string_lengths(num_strings, stream); - auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), - d_offset_pairs, - d_offset_pairs + num_strings, - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), - [] __device__(auto const& offsets) { - // Note: first character for non-field columns - return thrust::make_tuple( - static_cast(thrust::get<0>(offsets)), - static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); - }); - - cudf::io::parse_options_view options_view{}; - options_view.quotechar = '\0'; // no quotes - options_view.keepquotes = true; - auto d_offset_length_it = - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); - auto d_column_names = parse_data(input.data(), - d_offset_length_it, - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - cudf::get_current_device_resource_ref()); - auto to_host = [stream](auto const& col) { - if (col.is_empty()) return std::vector{}; - auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_host_vector_async( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto 
const h_offsets = cudf::detail::make_host_vector_async( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - stream.synchronize(); - - // build std::string vector from chars and offsets - std::vector host_data; - host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - return host_data; - }; - return to_host(d_column_names->view()); -} - -/** - * @brief Checks if all strings in each string column in the tree are nulls. - * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as - * false. - * - * @param input Input JSON string device data - * @param d_column_tree column tree representation of JSON string - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param options Parsing options specifying the parsing behaviour - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Array of bytes where each byte indicate if it is all nulls string column. - */ -rmm::device_uvector is_all_nulls_each_column(device_span input, - tree_meta_t const& d_column_tree, - tree_meta_t const& tree, - device_span col_ids, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_cols = d_column_tree.node_categories.size(); - rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. - * `root` must be a list type. 
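The `to_host` helper above rebuilds owning `std::string`s from a strings column's flat character buffer plus its offsets array (which has one more entry than there are strings). A minimal host-only sketch of that reconstruction, with hypothetical sample data:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Rebuild strings from a flat char buffer and size + 1 offsets, mirroring
    // the chars/offsets layout of a strings column copied to the host.
    std::vector<std::string> offsets_to_strings(std::vector<char> const& chars,
                                                std::vector<int32_t> const& offsets)
    {
      std::vector<std::string> out;
      out.reserve(offsets.size() - 1);
      for (std::size_t i = 0; i + 1 < offsets.size(); ++i) {
        out.emplace_back(chars.data() + offsets[i], offsets[i + 1] - offsets[i]);
      }
      return out;
    }

    int main()
    {
      std::vector<char> chars{'a', 'b', 'c', 'd', 'e', 'f'};  // "ab" "c" "def"
      std::vector<int32_t> offsets{0, 2, 3, 6};
      for (auto const& s : offsets_to_strings(chars, offsets)) std::cout << s << '\n';
    }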
- * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = [&]() { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; - }(); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - TreeDepthT const row_array_children_level = is_enabled_lines ? 
1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.begin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
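For orientation, `initialize_json_columns` above sizes each column's buffers from its maximum row offset: string/value columns get `max + 1` offset and length slots, list columns get `max + 2` child offsets, and every column holds `max + 1` rows. A rough host-side sketch of just that sizing rule (the names here are illustrative, not library API):

    #include <cstddef>
    #include <iostream>

    enum class col_kind { string_like, list, object };

    struct alloc_sizes {
      std::size_t num_rows, string_offsets, string_lengths, child_offsets;
    };

    alloc_sizes sizes_for(col_kind kind, std::size_t max_row_offset)
    {
      alloc_sizes s{max_row_offset + 1, 0, 0, 0};  // every column has max + 1 rows
      if (kind == col_kind::string_like) {
        s.string_offsets = s.string_lengths = max_row_offset + 1;
      } else if (kind == col_kind::list) {
        s.child_offsets = max_row_offset + 2;  // extra slot for the trailing end offset
      }
      return s;
    }

    int main()
    {
      auto const s = sizes_for(col_kind::list, 3);
      std::cout << s.num_rows << " rows, " << s.child_offsets << " child offsets\n";  // 4 rows, 5 child offsets
    }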
- auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_host_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return cudf::detail::make_empty_host_vector(0, stream); - }(); - - // use hash map because we may skip field name's col_ids - std::unordered_map> columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
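The pruning pass above keeps a column only when it, or something on its path, carries a user-requested dtype: a column with no dtype and a parent is tentatively pruned, while a requested dtype rescues the whole ancestor chain. A small host sketch of that rule with made-up ids:

    #include <iostream>
    #include <optional>
    #include <vector>

    int main()
    {
      constexpr int sentinel = -1;                 // the root has no parent
      std::vector<int> parent{sentinel, 0, 1, 1};  // 0 <- 1 <- {2, 3}
      std::vector<std::optional<int>> user_dtype{{}, {}, 42, {}};  // only column 2 requested
      std::vector<int> pruned(parent.size(), 0);

      for (int col = 0; col < static_cast<int>(parent.size()); ++col) {
        if (!user_dtype[col].has_value() && parent[col] != sentinel) {
          pruned[col] = 1;  // tentatively prune: no dtype requested here
        } else {
          // a requested dtype un-prunes every ancestor on the path to the root
          for (int p = parent[col]; p != sentinel && pruned[p] == 1; p = parent[p]) pruned[p] = 0;
        }
      }
      for (auto v : pruned) std::cout << v << ' ';  // prints: 0 0 0 1 (only column 3 stays pruned)
    }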
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = 1; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
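The `is_mixed_type` lambda above deliberately ignores a string/value side whose rows are all null literals, so fields that only ever held `null` do not force a struct or list sibling into a mixed-type string column. The same decision restated as a standalone predicate (a hypothetical helper, same logic):

    #include <iostream>

    enum node_cat { NC_STRUCT, NC_LIST, NC_STR, NC_VAL };

    // Two same-named columns merge as "mixed type" unless one side is a
    // string/value column consisting solely of null literals.
    bool is_mixed(node_cat new_cat, bool new_all_nulls, node_cat old_cat, bool old_all_nulls)
    {
      if ((new_cat == NC_STR || new_cat == NC_VAL) && new_all_nulls) return false;
      if ((old_cat == NC_STR || old_cat == NC_VAL) && old_all_nulls) return false;
      return true;
    }

    int main()
    {
      std::cout << is_mixed(NC_VAL, true, NC_STRUCT, false) << '\n';  // 0: all-null side merges quietly
      std::cout << is_mixed(NC_STR, false, NC_LIST, false) << '\n';   // 1: genuinely mixed
    }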
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = 1; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
- forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = 1; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; - } - }); - - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); -} - std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 75639a0438f..93ef2b46be1 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -185,6 +185,55 @@ struct device_json_column { } }; +namespace experimental { +/* + * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format. + */ +struct compressed_sparse_row { + rmm::device_uvector row_idx; + rmm::device_uvector col_idx; +}; + +/* + * @brief Auxiliary column tree properties that are required to construct the device json + * column subtree, but not required for the final cudf column construction. 
+ */
+struct column_tree_properties {
+ rmm::device_uvector categories;
+ rmm::device_uvector max_row_offsets;
+ rmm::device_uvector mapped_ids;
+};
+
+namespace detail {
+/**
+ * @brief Reduce node tree into column tree by aggregating each property of the column.
+ *
+ * @param node_tree Node tree representation of JSON string
+ * @param original_col_ids Column ids of nodes
+ * @param sorted_col_ids Sorted column ids of nodes
+ * @param ordered_node_ids Node ids of nodes sorted by column ids
+ * @param row_offsets Row offsets of nodes
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Tuple of compressed_sparse_row struct storing adjacency information of the column tree,
+ * and column_tree_properties struct storing properties of each node, i.e., column category, max
+ * number of rows in the column, and column id
+ */
+CUDF_EXPORT
+std::tuple reduce_to_column_tree(
+ tree_meta_t& node_tree,
+ device_span original_col_ids,
+ device_span sorted_col_ids,
+ device_span ordered_node_ids,
+ device_span row_offsets,
+ bool is_array_of_arrays,
+ NodeIndexT row_array_parent_col_id,
+ rmm::cuda_stream_view stream);
+
+} // namespace detail
+} // namespace experimental
+
 namespace detail {
 // TODO: return device_uvector instead of passing pre-allocated memory
@@ -299,22 +348,59 @@ get_array_children_indices(TreeDepthT row_array_children_level,
 device_span node_levels,
 device_span parent_node_ids,
 rmm::cuda_stream_view stream);
+
 /**
- * @brief Reduce node tree into column tree by aggregating each property of column.
+ * @brief Reduces node tree representation to column tree representation.
 *
- * @param tree json node tree to reduce (modified in-place, but restored to original state)
- * @param col_ids column ids of each node (modified in-place, but restored to original state)
- * @param row_offsets row offsets of each node (modified in-place, but restored to original state)
- * @param stream The CUDA stream to which kernels are dispatched
- * @return A tuple containing the column tree, identifier for each column and the maximum row index
- * in each column
+ * @param node_tree Node tree representation of JSON string
+ * @param original_col_ids Column ids of nodes
+ * @param sorted_col_ids Sorted column ids of nodes
+ * @param ordered_node_ids Node ids of nodes sorted by column ids
+ * @param row_offsets Row offsets of nodes
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A tuple of column tree representation of JSON string, column ids of columns, and
+ * max row offsets of columns
 */
+CUDF_EXPORT
 std::tuple, rmm::device_uvector>
-reduce_to_column_tree(tree_meta_t& tree,
- device_span col_ids,
- device_span row_offsets,
+reduce_to_column_tree(tree_meta_t& node_tree,
+ device_span original_col_ids,
+ device_span sorted_col_ids,
+ device_span ordered_node_ids,
+ device_span row_offsets,
+ bool is_array_of_arrays,
+ NodeIndexT const row_array_parent_col_id,
 rmm::cuda_stream_view stream);
-
+/**
+ * @brief Constructs `d_json_column` from node tree representation.
+ * Newly constructed columns are inserted into `root`'s children.
+ * `root` must be a list type.
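One conventional reading of the `compressed_sparse_row` struct above (an assumption; the exact adjacency the experimental column tree stores may differ): `row_idx[i]..row_idx[i+1]` brackets the slice of `col_idx` holding column i's neighbours. A host illustration with plain vectors standing in for the device vectors:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
      // Column tree 0 -> {1, 2}, 2 -> {3}, child lists concatenated in CSR form.
      std::vector<int> row_idx{0, 2, 2, 3, 3};  // num_columns + 1 entries
      std::vector<int> col_idx{1, 2, 3};        // concatenated child lists

      for (std::size_t node = 0; node + 1 < row_idx.size(); ++node) {
        std::cout << "column " << node << " children:";
        for (int j = row_idx[node]; j < row_idx[node + 1]; ++j) std::cout << ' ' << col_idx[j];
        std::cout << '\n';
      }
    }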
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 748691fb7d1..2ec23e0dc6d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -377,16 +376,12 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::INNER_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::INNER_JOIN, output_size, stream, mr); } std::pair>, @@ -395,16 +390,12 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::LEFT_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, output_size, stream, mr); } std::pair>, @@ -412,16 +403,12 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::FULL_JOIN, - {}, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::FULL_JOIN, {}, stream, mr); } std::unique_ptr> conditional_left_semi_join( @@ -429,16 +416,12 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return 
detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, stream, mr); } std::unique_ptr> conditional_left_anti_join( @@ -446,64 +429,56 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, stream, mr); } std::size_t conditional_inner_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, stream, mr); } std::size_t conditional_left_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, stream, mr); } std::size_t conditional_left_semi_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, stream, mr); } std::size_t conditional_left_anti_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 4f6a9484e8c..303442e79ef 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index eeb49736bac..15594fb60e3 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -75,10 +74,11 @@ std::unique_ptr cross_join(cudf::table_view const& left, std::unique_ptr cross_join(cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::get_default_stream(), mr); + return 
detail::cross_join(left, right, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 0abff27667b..7b13c260364 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -120,10 +119,11 @@ std::pair>, inner_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::inner_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -131,10 +131,11 @@ std::pair>, left_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::left_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -142,10 +143,11 @@ std::pair>, full_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::full_join(left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 573101cefd9..86402a0e7de 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -50,6 +51,11 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; +using semi_map_type = cuco::legacy::static_map>; + using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 8ff78dd47f4..820b81ee309 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -484,6 +483,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -495,7 +495,7 @@ mixed_inner_join( compare_nulls, detail::join_kind::INNER_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -506,6 +506,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -516,7 +517,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -530,6 +531,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -541,7 +543,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -552,6 +554,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { 
CUDF_FUNC_RANGE(); @@ -562,7 +565,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -576,6 +579,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -587,7 +591,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 89c13285cfe..19701816867 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,7 +25,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -161,38 +160,6 @@ struct pair_expression_equality : public expression_equality { } }; -/** - * @brief Equality comparator that composes two row_equality comparators. - */ -struct double_row_equality_comparator { - row_equality const equality_comparator; - row_equality const conditional_comparator; - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } -}; - -// A CUDA Cooperative Group of 4 threads for the hash set. -auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4; - -// The hash set type used by mixed_semi_join with the build_table. -using hash_set_type = cuco::static_set, - cuda::thread_scope_device, - double_row_equality_comparator, - cuco::linear_probing, - cudf::detail::cuco_allocator, - cuco::storage<1>>; - -// The hash_set_ref_type used by mixed_semi_join kerenels for probing. 
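The visible effect of the stream plumbing in the join files above is that the public join entry points now take the stream explicitly instead of always running on `cudf::get_default_stream()`. A usage sketch against the new `inner_join` overload (the two `sequence` key columns are fabricated for the example):

    #include <cstdint>
    #include <cudf/column/column.hpp>
    #include <cudf/filling.hpp>
    #include <cudf/join.hpp>
    #include <cudf/scalar/scalar.hpp>
    #include <cudf/table/table_view.hpp>
    #include <rmm/cuda_stream.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    int main()
    {
      rmm::cuda_stream stream;  // a non-default stream owned by the caller

      // Two single-column key tables: values 0..9 and 5..14.
      cudf::numeric_scalar<int32_t> zero{0}, five{5};
      auto left_col  = cudf::sequence(10, zero);
      auto right_col = cudf::sequence(10, five);
      cudf::table_view left{{left_col->view()}};
      cudf::table_view right{{right_col->view()}};

      // The stream is now an explicit argument rather than an internal default.
      auto [left_idx, right_idx] = cudf::inner_join(left,
                                                    right,
                                                    cudf::null_equality::EQUAL,
                                                    stream.view(),
                                                    rmm::mr::get_current_device_resource());
      stream.synchronize();
      return left_idx->size() == 5 ? 0 : 1;  // keys 5..9 overlap
    }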
-using hash_set_ref_type = hash_set_type::ref_type; - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index f2c5ff13638..7459ac3e99c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,16 +38,12 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { - auto constexpr cg_size = hash_set_ref_type::cg_size; - - auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -56,24 +52,24 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates]; + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = left_num_rows; - cudf::size_type const outer_num_rows = left_table.num_rows(); - auto const outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); if (outer_row_index < outer_num_rows) { - // Make sure to swap_tables here as hash_set will use probe table as the left one. - auto constexpr swap_tables = true; // Figure out the number of elements for this key. 
auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + evaluator, thread_intermediate_storage, false, equality_probe}; - auto const set_ref_equality = set_ref.with_key_eq(equality); - auto const result = set_ref_equality.contains(tile, outer_row_index); - if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result; + left_table_keep_mask[outer_row_index] = + hash_table_view.contains(outer_row_index, hash_probe, equality); } } @@ -82,8 +78,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -97,8 +94,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } else { @@ -108,8 +106,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index b08298e64e4..43714ffb36a 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,8 +45,9 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] set_ref The hash table device view built from `build`. + * @param[in] hash_table_view The hash table built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -57,8 +58,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 719b1d47105..aa4fa281159 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -46,6 +45,45 @@ namespace cudf { namespace detail { +namespace { +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +struct make_pair_function_semi { + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // The value is irrelevant since we only ever use the hash map to check for + // membership of a particular row index. 
+ return cuco::make_pair(static_cast(i), 0); + } +}; + +/** + * @brief Equality comparator that composes two row_equality comparators. + */ +class double_row_equality { + public: + double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) + : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} + { + } + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } + + private: + row_equality _equality_comparator; + row_equality _conditional_comparator; +}; + +} // namespace + std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -57,7 +95,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -98,7 +136,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or + cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -117,20 +155,27 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - cudf::experimental::row::equality::preprocessed_table::create(build, stream); + experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); + semi_map_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. 
auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - + auto const hash_build = row_hash_build.device_hasher(build_nulls); // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. An alternative solution @@ -145,28 +190,20 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); + experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); + double_row_equality equality_build{equality_build_equality, equality_build_conditional}; + make_pair_function_semi pair_func_build{}; - hash_set_type row_set{ - {compute_hash_table_size(build.num_rows())}, - cuco::empty_key{JoinNoneValue}, - {equality_build_equality, equality_build_conditional}, - {row_hash_build.device_hasher(build_nulls)}, - {}, - {}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - {stream.value()}}; - - auto iter = thrust::make_counting_iterator(0); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - row_set.insert(iter, iter + right_num_rows, stream.value()); + hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -174,19 +211,18 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value()); + hash_table.insert_if( + iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); } + auto hash_table_view = hash_table.get_device_view(); + detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = - parser.shmem_per_thread * - cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); + auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); - // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -195,8 +231,9 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, + hash_probe, equality_probe, - row_set_ref, + hash_table_view, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, @@ -229,6 +266,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + 
rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -239,7 +277,7 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -250,6 +288,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -260,7 +299,7 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index f69ded73e8d..d2ab2122c75 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -98,22 +97,24 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, stream, mr); } std::unique_ptr> left_anti_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 2dd25a7b890..e1c1d2e3002 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1021,6 +1021,76 @@ struct group_key_func { } }; +// merges all the tdigests within each group. returns a table containing 2 columns: +// the sorted means and weights. +template +std::pair, rmm::device_uvector> generate_merged_centroids( + tdigest_column_view const& tdv, + GroupOffsetIter group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream) +{ + auto temp_mr = cudf::get_current_device_resource_ref(); + + auto const total_merged_centroids = tdv.means().size(); + + // output is the merged centroids (means, weights) + rmm::device_uvector output_means(total_merged_centroids, stream, temp_mr); + rmm::device_uvector output_weights(total_merged_centroids, stream, temp_mr); + + // each group represents a collection of tdigest columns. each row is 1 tdigest. + // within each group, we want to sort all the centroids within all the tdigests + // in that group, using the means as the key. the "outer offsets" represent the indices of the + // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest. 
+ // + // rows + // ---- centroid 0 --------- + // tdigest 0 centroid 1 + // ---- centroid 2 group 0 + // tdigest 1 centroid 3 + // ---- centroid 4 --------- + // tdigest 2 centroid 5 + // ---- centroid 6 group 1 + // tdigest 3 centroid 7 + // centroid 8 + // ---- centroid 9 -------- + auto inner_offsets = tdv.centroids().offsets(); + auto centroid_offsets = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [group_offsets, inner_offsets = tdv.centroids().offsets().begin()] __device__( + size_type i) { return inner_offsets[group_offsets[i]]; })); + + // perform the sort using the means as the key + size_t temp_size; + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(nullptr, + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + rmm::device_buffer temp_mem(temp_size, stream, temp_mr); + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(temp_mem.data(), + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + return {std::move(output_means), std::move(output_weights)}; +} + template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, HGroupOffsetIter h_outer_offsets, @@ -1032,59 +1102,6 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an - // algorithm like a super-merge that takes two layers of keys: one which identifies the outer - // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the - // outer groups. - // TODO: investigate replacing the iterative merge with a single stable_sort_by_key. - - // bring tdigest offsets back to the host - auto tdigest_offsets = tdv.centroids().offsets(); - std::vector h_inner_offsets(tdigest_offsets.size()); - cudaMemcpyAsync(h_inner_offsets.data(), - tdigest_offsets.begin(), - sizeof(size_type) * tdigest_offsets.size(), - cudaMemcpyDefault, - stream); - - stream.synchronize(); - - // extract all means and weights into a table - cudf::table_view tdigests_unsliced({tdv.means(), tdv.weights()}); - - // generate the merged (but not yet compressed) tdigests for each group. 
- std::vector> tdigests; - tdigests.reserve(num_groups); - std::transform(h_outer_offsets, - h_outer_offsets + num_groups, - std::next(h_outer_offsets), - std::back_inserter(tdigests), - [&](auto tdigest_start, auto tdigest_end) { - // the range of tdigests in this group - auto const num_tdigests = tdigest_end - tdigest_start; - - // slice each tdigest from the input - std::vector unmerged_tdigests; - unmerged_tdigests.reserve(num_tdigests); - auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start); - std::transform( - offset_iter, - offset_iter + num_tdigests, - std::next(offset_iter), - std::back_inserter(unmerged_tdigests), - [&](size_type start, size_type end) { - return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); - }); - - // merge - return cudf::detail::merge(unmerged_tdigests, - {0}, - {order::ASCENDING}, - {}, - stream, - cudf::get_current_device_resource_ref()); - }); - // generate min and max values auto merged_min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -1121,7 +1138,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto group_num_weights = cudf::detail::make_counting_transform_iterator( 0, group_num_weights_func{group_offsets, - tdigest_offsets.begin()}); + tdv.centroids().offsets().begin()}); thrust::replace_if(rmm::exec_policy(stream), merged_min_col->mutable_view().begin(), merged_min_col->mutable_view().end(), @@ -1135,29 +1152,33 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_is_empty{}, 0); - // concatenate all the merged tdigests back into one table. - std::vector tdigest_views; - tdigest_views.reserve(num_groups); - std::transform(tdigests.begin(), - tdigests.end(), - std::back_inserter(tdigest_views), - [](std::unique_ptr
const& t) { return t->view(); }); - auto merged = - cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref()); + auto temp_mr = cudf::get_current_device_resource_ref(); + + // merge the centroids + auto [merged_means, merged_weights] = + generate_merged_centroids(tdv, group_offsets, num_groups, stream); + size_t const num_centroids = tdv.means().size(); + CUDF_EXPECTS(merged_means.size() == num_centroids, + "Unexpected number of centroids in merged result"); // generate cumulative weights - auto merged_weights = merged->get_column(1).view(); - auto cumulative_weights = cudf::make_numeric_column( - data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream); - auto keys = cudf::detail::make_counting_transform_iterator( - 0, - group_key_func{ - group_labels, tdigest_offsets.begin(), tdigest_offsets.size()}); + rmm::device_uvector cumulative_weights(merged_weights.size(), stream, temp_mr); + + // generate group keys for all centroids in the entire column + rmm::device_uvector group_keys(num_centroids, stream, temp_mr); + auto iter = thrust::make_counting_iterator(0); + auto inner_offsets = tdv.centroids().offsets(); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_centroids, + group_keys.begin(), + group_key_func{ + group_labels, inner_offsets.begin(), inner_offsets.size()}); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - keys, - keys + cumulative_weights->size(), - merged_weights.begin(), - cumulative_weights->mutable_view().begin()); + group_keys.begin(), + group_keys.begin() + num_centroids, + merged_weights.begin(), + cumulative_weights.begin()); auto const delta = max_centroids; @@ -1166,37 +1187,32 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, delta, num_groups, nearest_value_centroid_weights{ - cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, - centroid_group_info{cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + centroid_group_info{ + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, false, stream, mr); // input centroid values auto centroids = cudf::detail::make_counting_transform_iterator( - 0, - make_weighted_centroid{merged->get_column(0).view().begin(), - merged_weights.begin()}); + 0, make_weighted_centroid{merged_means.begin(), merged_weights.begin()}); // compute the tdigest return compute_tdigests( delta, centroids, - centroids + merged->num_rows(), + centroids + merged_means.size(), cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d3a7ce5a4e..9824c472b20 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -132,6 +132,13 @@ struct cuda_event { cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } virtual ~cuda_event() { 
CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + // Moveable but not copyable. + cuda_event(const cuda_event&) = delete; + cuda_event& operator=(const cuda_event&) = delete; + + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + operator cudaEvent_t() { return e_; } private: @@ -147,11 +154,12 @@ struct cuda_event { */ cudaEvent_t event_for_thread() { - thread_local std::vector> thread_events(get_num_cuda_devices()); + // The program may crash if this function is called from the main thread and user application + // subsequently calls cudaDeviceReset(). + // As a workaround, here we intentionally disable RAII and leak cudaEvent_t. + thread_local std::vector thread_events(get_num_cuda_devices()); auto const device_id = get_current_cuda_device(); - if (not thread_events[device_id.value()]) { - thread_events[device_id.value()] = std::make_unique(); - } + if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); } return *thread_events[device_id.value()]; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1bedb344a01..b67d922d377 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -329,6 +329,7 @@ ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cp ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 @@ -687,10 +688,12 @@ ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DATETIME_TEST streams/datetime_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_JOIN_TEST streams/join_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu new file mode 100644 index 00000000000..a336b327732 --- /dev/null +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/json/nested_json.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +namespace cuio_json = cudf::io::json; + +struct h_tree_meta_t { + std::vector node_categories; + std::vector parent_node_ids; + std::vector node_range_begin; + std::vector node_range_end; +}; + +struct h_column_tree { + // position of nnzs + std::vector row_idx; + std::vector col_idx; + // node properties + std::vector categories; + std::vector column_ids; +}; + +// debug printing +template +void print(cudf::host_span vec, std::string name) +{ + std::cout << name << " = "; + for (auto e : vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} + +bool check_equality(cuio_json::tree_meta_t& d_a, + cudf::device_span d_a_max_row_offsets, + cuio_json::experimental::compressed_sparse_row& d_b_csr, + cuio_json::experimental::column_tree_properties& d_b_ctp, + rmm::cuda_stream_view stream) +{ + // convert from tree_meta_t to column_tree_csr + stream.synchronize(); + + h_tree_meta_t a{cudf::detail::make_std_vector_async(d_a.node_categories, stream), + cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream), + cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), + cudf::detail::make_std_vector_async(d_a.node_range_end, stream)}; + + h_column_tree b{cudf::detail::make_std_vector_async(d_b_csr.row_idx, stream), + cudf::detail::make_std_vector_async(d_b_csr.col_idx, stream), + cudf::detail::make_std_vector_async(d_b_ctp.categories, stream), + cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)}; + + auto a_max_row_offsets = cudf::detail::make_std_vector_async(d_a_max_row_offsets, stream); + auto b_max_row_offsets = cudf::detail::make_std_vector_async(d_b_ctp.max_row_offsets, stream); + + stream.synchronize(); + + auto num_nodes = a.parent_node_ids.size(); + if (num_nodes > 1) { + if (b.row_idx.size() != num_nodes + 1) { return false; } + + for (auto pos = b.row_idx[0]; pos < b.row_idx[1]; pos++) { + auto v = b.col_idx[pos]; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) { return false; } + } + for (size_t u = 1; u < num_nodes; u++) { + auto v = b.col_idx[b.row_idx[u]]; + if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) { return false; } + + for (auto pos = b.row_idx[u] + 1; pos < b.row_idx[u + 1]; pos++) { + v = b.col_idx[pos]; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) { return false; } + } + } + for (size_t u = 0; u < num_nodes; u++) { + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } + } + for (size_t u = 0; u < num_nodes; u++) { + if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; } + } + } else if (num_nodes == 1) { + if (b.row_idx.size() != num_nodes + 1) { return false; } + + if (b.row_idx[0] != 0 || b.row_idx[1] != 1) return false; + if (!b.col_idx.empty()) return false; + for (size_t u = 0; u < num_nodes; u++) { + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } + } + + for (size_t u = 0; u < num_nodes; u++) { + if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; } + } + } + return true; +} + +void run_test(std::string const& input, bool enable_lines = true) +{ + auto const stream = cudf::get_default_stream(); + cudf::string_scalar d_scalar(input, true, stream); + auto d_input = 
cudf::device_span{d_scalar.data(), + static_cast(d_scalar.size())}; + + cudf::io::json_reader_options options{}; + options.enable_lines(enable_lines); + options.enable_mixed_types_as_string(true); + + // Parse the JSON and get the token stream + auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, cudf::get_current_device_resource_ref()); + + // Get the JSON's tree representation + auto gpu_tree = + cuio_json::detail::get_tree_representation(tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string(), + stream, + cudf::get_current_device_resource_ref()); + + bool const is_array_of_arrays = [&]() { + std::array h_node_categories = {cuio_json::NC_ERR, cuio_json::NC_ERR}; + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), + gpu_tree.node_categories.data(), + sizeof(cuio_json::node_t) * size_to_copy, + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + if (options.is_enabled_lines()) return h_node_categories[0] == cuio_json::NC_LIST; + return h_node_categories[0] == cuio_json::NC_LIST and + h_node_categories[1] == cuio_json::NC_LIST; + }(); + + auto tup = + cuio_json::detail::records_orient_tree_traversal(d_input, + gpu_tree, + is_array_of_arrays, + options.is_enabled_lines(), + stream, + rmm::mr::get_current_device_resource()); + auto& gpu_col_id = std::get<0>(tup); + auto& gpu_row_offsets = std::get<1>(tup); + + auto const num_nodes = gpu_col_id.size(); + rmm::device_uvector sorted_col_ids(gpu_col_id.size(), stream); // make a copy + thrust::copy( + rmm::exec_policy(stream), gpu_col_id.begin(), gpu_col_id.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(gpu_col_id.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + cudf::size_type const row_array_parent_col_id = [&]() { + cudf::size_type value = cuio_json::parent_node_sentinel; + auto const list_node_index = options.is_enabled_lines() ? 
0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + gpu_col_id.data() + list_node_index, + sizeof(cudf::size_type), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + return value; + }(); + + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + cudf::io::json::detail::reduce_to_column_tree(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto [d_column_tree_csr, d_column_tree_properties] = + cudf::io::json::experimental::detail::reduce_to_column_tree(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto iseq = check_equality( + d_column_tree, d_max_row_offsets, d_column_tree_csr, d_column_tree_properties, stream); + // assert equality between csr and meta formats + ASSERT_TRUE(iseq); +} + +struct JsonColumnTreeTests : public cudf::test::BaseFixture {}; + +TEST_F(JsonColumnTreeTests, JSONL_Small) +{ + std::string const input = + R"( {} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"; // Prepare input & output buffers + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_Large) +{ + std::string const input = + R"( {} + {} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"; + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_ListofStruct) +{ + std::string const input = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": { } } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_MissingEntries) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSONL_MoreMissingEntries) +{ + std::string json_stringl = R"( + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSONL_StillMoreMissingEntries) +{ + std::string json_stringl = R"( + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": ["123","456"], "bar": 123 } + { "foo2": { "b": 5 }, "car": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSON_MissingEntries) +{ + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_StructOfStructs) +{ + std::string json_string = + R"([ + {}, + { "a": { "y" : 6, "z": [] }}, + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + ])"; // Prepare input & output buffers + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_ArrayOfArrays_NestedList) +{ + std::string json_string = + R"([123, [1,2,3]] + [456, null, { "a": 1 }])"; + run_test(json_string); +} 
+ +TEST_F(JsonColumnTreeTests, JSON_ArrayofArrays_NestedList) +{ + std::string json_string = R"([[[1,2,3], null, 123], + [null, { "a": 1 }, 456 ]])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_Empty) +{ + std::string json_string = R"([])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_List) +{ + std::string json_string = R"([123])"; + run_test(json_string, true); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedList) +{ + std::string json_string = R"([[[]]])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedLists) +{ + std::string json_string = R"([[], [], []])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_ListofLists) +{ + std::string json_string = R"([[1, 2, 3], [4, 5, null], []])"; + run_test(json_string, true); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_EmptyListOfLists) +{ + std::string json_string = R"([[]])"; + run_test(json_string, true); +} diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index ab387a5c7f5..3431e941359 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -39,6 +39,8 @@ #include #include +#include + #include template @@ -60,6 +62,7 @@ template >, cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr), cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK> std::unique_ptr join_and_gather( @@ -68,12 +71,13 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - join_impl(left_selected, right_selected, compare_nulls, mr); + join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*left_join_indices}; auto right_indices_span = cudf::device_span{*right_join_indices}; @@ -2027,7 +2031,11 @@ struct JoinTestLists : public cudf::test::BaseFixture { auto const probe_tv = cudf::table_view{{probe}}; auto const [left_result_map, right_result_map] = - join_func(build_tv, probe_tv, nulls_equal, cudf::get_current_device_resource_ref()); + join_func(build_tv, + probe_tv, + nulls_equal, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_result_table = sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy); diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 08a0136700d..6c147c8a128 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,21 +778,6 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } -TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) -{ - auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); - auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_one_greater_right_one = - cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); - - this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 
6, 7, 8, 9, 0}}, - {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, - {0}, - {1}, - left_one_greater_right_one, - {2, 7, 8}); -} - TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -915,18 +900,3 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } - -TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) -{ - auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); - auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_one_greater_right_one = - cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); - - this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, - {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, - {0}, - {1}, - left_one_greater_right_one, - {0, 1, 3, 4, 5, 6, 9}); -} diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 3e279260b99..554d5754e39 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -28,8 +28,11 @@ #include #include #include +#include #include +#include + #include template @@ -51,6 +54,7 @@ template > (*join_impl)( cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)> std::unique_ptr join_and_gather( cudf::table_view const& left_input, @@ -58,11 +62,12 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); - auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, mr); + auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*join_indices}; auto left_indices_col = cudf::column_view{left_indices_span}; diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp new file mode 100644 index 00000000000..82629156fa6 --- /dev/null +++ b/cpp/tests/streams/datetime_test.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class DatetimeTest : public cudf::test::BaseFixture { + public: + cudf::test::fixed_width_column_wrapper timestamps{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + cudf::test::fixed_width_column_wrapper months{{1, -1, 3}}; +}; + +TEST_F(DatetimeTest, ExtractYear) +{ + cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMonth) +{ + cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractDay) +{ + cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractWeekday) +{ + cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractHour) +{ + cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMinute) +{ + cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractSecond) +{ + cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMillisecondFraction) +{ + cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMicrosecondFraction) +{ + cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractNanosecondFraction) +{ + cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, LastDayOfMonth) +{ + cudf::datetime::last_day_of_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DayOfYear) +{ + cudf::datetime::day_of_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonths) +{ + cudf::datetime::add_calendrical_months(timestamps, months, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonthsScalar) +{ + auto scalar = cudf::make_fixed_width_scalar(1, cudf::test::get_default_stream()); + + cudf::datetime::add_calendrical_months(timestamps, *scalar, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, IsLeapYear) +{ + cudf::datetime::is_leap_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DaysInMonth) +{ + cudf::datetime::days_in_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractQuarter) +{ + cudf::datetime::extract_quarter(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, CeilDatetimes) +{ + cudf::datetime::ceil_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, FloorDatetimes) +{ + cudf::datetime::floor_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, RoundDatetimes) +{ + cudf::datetime::round_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp new file mode 100644 index 00000000000..2811bb676fa --- /dev/null +++ b/cpp/tests/streams/join_test.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +class JoinTest : public cudf::test::BaseFixture { + static inline cudf::table make_table() + { + cudf::test::fixed_width_column_wrapper col0{{3, 1, 2, 0, 3}}; + cudf::test::strings_column_wrapper col1{{"s0", "s1", "s2", "s4", "s1"}}; + cudf::test::fixed_width_column_wrapper col2{{0, 1, 2, 4, 1}}; + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return cudf::table{std::move(columns)}; + } + + public: + cudf::table table0{make_table()}; + cudf::table table1{make_table()}; + cudf::table conditional0{make_table()}; + cudf::table conditional1{make_table()}; + cudf::ast::column_reference col_ref_left_0{0}; + cudf::ast::column_reference col_ref_right_0{0, cudf::ast::table_reference::RIGHT}; + cudf::ast::operation left_zero_eq_right_zero{ + cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0}; +}; + +TEST_F(JoinTest, InnerJoin) +{ + cudf::inner_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftJoin) +{ + cudf::left_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, FullJoin) +{ + cudf::full_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftSemiJoin) +{ + cudf::left_semi_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftAntiJoin) +{ + cudf::left_anti_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, CrossJoin) { cudf::cross_join(table0, table1, cudf::test::get_default_stream()); } + +TEST_F(JoinTest, ConditionalInnerJoin) +{ + cudf::conditional_inner_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoin) +{ + cudf::conditional_left_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalFullJoin) +{ + cudf::conditional_full_join( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoin) +{ + cudf::conditional_left_semi_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoin) +{ + cudf::conditional_left_anti_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoin) +{ + cudf::mixed_inner_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoin) +{ + cudf::mixed_left_join(table0, + 
table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedFullJoin) +{ + cudf::mixed_full_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftSemiJoin) +{ + cudf::mixed_left_semi_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftAntiJoin) +{ + cudf::mixed_left_anti_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoinSize) +{ + cudf::mixed_inner_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoinSize) +{ + cudf::mixed_left_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalInnerJoinSize) +{ + cudf::conditional_inner_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoinSize) +{ + cudf::conditional_left_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoinSize) +{ + cudf::conditional_left_semi_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoinSize) +{ + cudf::conditional_left_anti_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} diff --git a/dependencies.yaml b/dependencies.yaml index 325f2dbcba7..911c443d294 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -43,15 +43,28 @@ files: includes: - cuda_version - test_cpp - test_python: + test_python_cudf_pandas: output: none includes: - cuda_version - py_version - test_python_common - test_python_cudf - - test_python_dask_cudf - test_python_cudf_pandas + test_python_cudf: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_cudf + test_python_other: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_dask_cudf test_java: output: none includes: @@ -350,12 +363,12 @@ dependencies: common: - output_types: conda packages: - - fmt>=10.1.1,<11 + - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 # Align nvcomp version with rapids-cmake - nvcomp==4.0.1 - - spdlog>=1.12.0,<1.13 + - spdlog>=1.14.1,<1.15 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] @@ -650,7 +663,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.0,<1.3 + - polars>=1.8,<1.9 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] @@ -707,9 +720,7 @@ dependencies: - matrix: {dependencies: "oldest"} packages: - numba==0.57.* - - numpy==1.23.* - pandas==2.0.* - - pyarrow==14.0.0 - matrix: packages: - output_types: conda @@ -764,6 +775,14 @@ dependencies: - &transformers transformers==4.39.3 - tzdata specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: 
+              - numpy==1.23.*
+              - pyarrow==14.0.0
+          - matrix:
+            packages:
       - output_types: conda
         matrices:
           - matrix:
@@ -783,6 +802,15 @@ dependencies:
       packages:
         - dask-cuda==24.12.*,>=0.0.0a0
         - *numba
+    specific:
+      - output_types: [conda, requirements]
+        matrices:
+          - matrix: {dependencies: "oldest"}
+            packages:
+              - numpy==1.24.*
+              - pyarrow==14.0.1
+          - matrix:
+            packages:
   depends_on_libcudf:
     common:
       - output_types: conda
diff --git a/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png
new file mode 100644
index 00000000000..e472cf66612
Binary files /dev/null and b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png differ
diff --git a/docs/cudf/source/_static/compute_heavy_queries_polars.png b/docs/cudf/source/_static/compute_heavy_queries_polars.png
new file mode 100644
index 00000000000..6854ed5a436
Binary files /dev/null and b/docs/cudf/source/_static/compute_heavy_queries_polars.png differ
diff --git a/docs/cudf/source/_static/pds_benchmark_polars.png b/docs/cudf/source/_static/pds_benchmark_polars.png
new file mode 100644
index 00000000000..d0b48ab2901
Binary files /dev/null and b/docs/cudf/source/_static/pds_benchmark_polars.png differ
diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst
new file mode 100644
index 00000000000..0a3a0d86b2c
--- /dev/null
+++ b/docs/cudf/source/cudf_polars/index.rst
@@ -0,0 +1,41 @@
+Polars GPU engine
+=================
+
+cuDF provides an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API.
+The engine supports most of the core expressions and data types, as well as a growing set of more advanced
+dataframe manipulations and data file formats. When using the GPU engine, Polars converts expressions into an
+optimized query plan and determines whether the plan is supported on the GPU. If it is not, execution
+transparently falls back to the standard Polars engine and runs on the CPU.
+
+Benchmark
+---------
+We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:
+
+.. figure:: ../_static/pds_benchmark_polars.png
+   :width: 600px
+
+You can see up to a 13x speedup using the GPU engine on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top-performing queries:
+
+.. figure:: ../_static/compute_heavy_queries_polars.png
+   :width: 1000px
+
+:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe`
+
+You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository `__.
+
+Learn More
+----------
+
+The GPU engine for Polars is now available in Open Beta and is under rapid development. To learn more, visit the `GPU Support page `__ on the Polars website.
+
+Launch on Google Colab
+----------------------
+
+.. figure:: ../_static/colab.png
+   :width: 200px
+   :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb
+
+   Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__.
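+
+Example
+-------
+
+As a minimal sketch (the data and column names here are hypothetical), a lazy
+query can be executed on the GPU simply by requesting the GPU engine at
+``collect`` time::
+
+    import polars as pl
+
+    q = (
+        pl.LazyFrame({"key": [1, 1, 2], "value": [10.0, 20.0, 30.0]})
+        .group_by("key")
+        .agg(pl.col("value").sum())
+    )
+
+    # Executes on the GPU when the plan is supported; otherwise Polars
+    # transparently falls back to the default CPU engine.
+    result = q.collect(engine="gpu")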
diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst
index 3b8dfa5fe01..1b86cafeb48 100644
--- a/docs/cudf/source/index.rst
+++ b/docs/cudf/source/index.rst
@@ -29,5 +29,6 @@ other operations.
 
    user_guide/index
    cudf_pandas/index
+   cudf_polars/index
    libcudf_docs/index
    developer_guide/index
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
new file mode 100644
index 00000000000..06f74a38709
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
@@ -0,0 +1,6 @@
+=======
+extract
+=======
+
+.. automodule:: pylibcudf.strings.extract
+    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
new file mode 100644
index 00000000000..9850ee10098
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
@@ -0,0 +1,6 @@
+=======
+findall
+=======
+
+.. automodule:: pylibcudf.strings.findall
+    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 1200ecba5d9..9b1a6b72a88 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -7,9 +7,12 @@ strings
    capitalize
    char_types
    contains
+   extract
    find
+   findall
    regex_flags
    regex_program
    repeat
    replace
    slice
+   strip
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst
new file mode 100644
index 00000000000..a79774b8e67
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst
@@ -0,0 +1,6 @@
+=====
+strip
+=====
+
+.. automodule:: pylibcudf.strings.strip
+    :members:
diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst
new file mode 100644
index 00000000000..6cd098da56d
--- /dev/null
+++ b/docs/dask_cudf/source/best_practices.rst
@@ -0,0 +1,320 @@
+.. _best-practices:
+
+Dask cuDF Best Practices
+========================
+
+This page outlines several important guidelines for using `Dask cuDF
+`__ effectively.
+
+.. note::
+    Since Dask cuDF is a backend extension for
+    `Dask DataFrame `__,
+    the guidelines discussed in the `Dask DataFrames Best Practices
+    `__
+    documentation also apply to Dask cuDF (excluding any pandas-specific
+    details).
+
+
+Deployment and Configuration
+----------------------------
+
+Use Dask-CUDA
+~~~~~~~~~~~~~
+
+To execute a Dask workflow on multiple GPUs, a Dask cluster must
+be deployed with `Dask-CUDA `__
+and `Dask.distributed `__.
+
+When running on a single machine, the `LocalCUDACluster `__
+convenience function is strongly recommended. No matter how many GPUs are
+available on the machine (even one!), using `Dask-CUDA has many advantages
+`__
+over default (threaded) execution. Just to list a few:
+
+* Dask-CUDA makes it easy to pin workers to specific devices.
+* Dask-CUDA makes it easy to configure memory-spilling options.
+* The distributed scheduler collects useful diagnostic information that can be viewed on a dashboard in real time.
+
+Please see `Dask-CUDA's API `__
+and `Best Practices `__
+documentation for detailed information. Typical ``LocalCUDACluster`` usage
+is also illustrated within the multi-GPU section of `Dask cuDF's
+`__ documentation.
+
+.. note::
+.. note::
+    When running on cloud infrastructure or HPC systems, it is usually best to
+    leverage system-specific deployment libraries like `Dask Operator
+    `__ and `Dask-Jobqueue
+    `__.
+
+    Please see `the RAPIDS deployment documentation `__
+    for further details and examples.
+
+
+Use diagnostic tools
+~~~~~~~~~~~~~~~~~~~~
+
+The Dask ecosystem includes several diagnostic tools that are well worth
+using. These include an intuitive `browser dashboard
+`__ as well as a dedicated
+`API for collecting performance profiles
+`__.
+
+No matter the workflow, using the dashboard is strongly recommended.
+It provides a visual representation of worker resources and compute
+progress. It also shows basic GPU memory and utilization metrics (under
+the ``GPU`` tab). To visualize more detailed GPU metrics in JupyterLab,
+use `NVDashboard `__.
+
+
+Enable cuDF spilling
+~~~~~~~~~~~~~~~~~~~~
+
+When using Dask cuDF for classic ETL workloads, it is usually best
+to enable `native spilling support in cuDF
+`__.
+When using :func:`LocalCUDACluster`, this is easily accomplished by
+setting ``enable_cudf_spill=True``.
+
+When a Dask cuDF workflow includes conversion between DataFrame and Array
+representations, native cuDF spilling may be insufficient. For these cases,
+`JIT-unspill `__
+is likely to provide better protection from out-of-memory (OOM) errors.
+Please see `Dask-CUDA's spilling documentation
+`__ for further details
+and guidance.
+
+Use RMM
+~~~~~~~
+
+Memory allocations in cuDF are significantly faster and more efficient when
+the `RAPIDS Memory Manager (RMM) `__
+library is configured appropriately on worker processes. In most cases, the
+best way to manage memory is by initializing an RMM pool on each worker
+before executing a workflow. When using :func:`LocalCUDACluster`, this is
+easily accomplished by setting ``rmm_pool_size`` to a large fraction
+(e.g. ``0.9``).
+
+See the `Dask-CUDA memory-management documentation
+`__
+for more details.
+
+Use the Dask DataFrame API
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Although Dask cuDF provides a public ``dask_cudf`` Python module, we
+strongly recommend that you use the CPU/GPU-portable ``dask.dataframe``
+API instead. Simply `use the Dask configuration system
+`__
+to set the ``"dataframe.backend"`` option to ``"cudf"``, and the
+``dask_cudf`` module will be imported and used implicitly.
+
+Be sure to use the :func:`to_backend` method if you need to convert
+between the different DataFrame backends. For example::
+
+    df = df.to_backend("pandas")  # This gives us a pandas-backed collection
+
+.. note::
+    Although :func:`to_backend` makes it easy to move data between pandas
+    and cuDF, repetitive CPU-GPU data movement can degrade performance
+    significantly. For optimal results, keep your data on the GPU as much
+    as possible.
+
+Avoid eager execution
+~~~~~~~~~~~~~~~~~~~~~
+
+Although Dask DataFrame collections are lazy by default, there are several
+notable methods that will result in the immediate execution of the
+underlying task graph:
+
+:func:`compute`: Calling ``ddf.compute()`` will materialize the result of
+``ddf`` and return a single cuDF object. This is done by executing the entire
+task graph associated with ``ddf`` and concatenating its partitions in
+local memory on the client process.
+
+.. note::
+    Never call :func:`compute` on a large collection that cannot fit
+    comfortably in the memory of a single GPU!
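+
+For instance, a small reduction is usually safe to materialize, while a full
+collection is not (a sketch; ``ddf`` stands in for an existing Dask cuDF
+collection with a hypothetical ``"amount"`` column)::
+
+    total = ddf["amount"].sum().compute()  # Small scalar result: safe
+
+    # df = ddf.compute()  # Materializes every partition on the client: avoid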
+
+:func:`persist`: Like :func:`compute`, calling ``ddf.persist()`` will
+execute the entire task graph associated with ``ddf``. The most important
+difference is that the computed partitions will remain in distributed
+worker memory instead of being concatenated together on the client process.
+Another difference is that :func:`persist` will return immediately when
+executing on a distributed cluster. If you need a blocking synchronization
+point in your workflow, simply use the :func:`wait` function::
+
+    ddf = ddf.persist()
+    wait(ddf)
+
+.. note::
+    Avoid calling :func:`persist` on a large collection that cannot fit
+    comfortably in global worker memory. If the total sum of the partition
+    sizes is larger than the sum of all GPU memory, calling persist will
+    result in significant spilling from device memory. If the individual
+    partition sizes are large, this is likely to produce an OOM error.
+
+:func:`len` / :func:`head` / :func:`tail`: Although these operations are used
+often within pandas/cuDF code to quickly inspect data, it is best to avoid
+them in Dask DataFrame. In most cases, these operations will execute some or
+all of the underlying task graph to materialize the collection.
+
+:func:`sort_values` / :func:`set_index`: These operations both require Dask
+to eagerly collect quantile information about the column(s) being targeted
+by the global sort operation. See `Avoid Sorting`__ below for further
+considerations.
+
+.. note::
+    When using :func:`set_index`, be sure to pass in ``sort=False`` whenever
+    the global collection does not **need** to be sorted by the new index.
+
+Avoid Sorting
+~~~~~~~~~~~~~
+
+`The design of Dask DataFrame `__
+makes it advantageous to work with data that is already sorted along its
+index at creation time. For most other cases, it is best to avoid sorting
+unless the logic of the workflow makes global ordering absolutely necessary.
+
+If the purpose of a :func:`sort_values` operation is to ensure that all
+unique values in ``by`` will be moved to the same output partition, then
+`shuffle
+`__
+is often the better option.
+
+
+Reading Data
+------------
+
+Tune the partition size
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The ideal partition size is usually between 1/32 and 1/8 of the memory
+capacity of a single GPU. Increasing the partition size will typically
+reduce the number of tasks in your workflow and improve the GPU utilization
+for each task. However, if the partitions are too large, the risk of OOM
+errors can become significant.
+
+.. note::
+    As a general rule of thumb, start with 1/32-1/16 for shuffle-intensive
+    workflows (e.g. large-scale sorting and joining), and 1/16-1/8 otherwise.
+    For pathologically skewed data distributions, it may be necessary to
+    target 1/64 or smaller. This rule of thumb comes from anecdotal
+    optimization and OOM-debugging experience. Since every workflow is
+    different, choosing the best partition size is both an art and a science.
+
+The easiest way to tune the partition size is when the DataFrame collection
+is first created by a function like :func:`read_parquet`, :func:`read_csv`,
+or :func:`from_map`. For example, both :func:`read_parquet` and
+:func:`read_csv` expose a ``blocksize`` argument for adjusting the maximum
+partition size, as sketched below.
+
+If the partition size cannot be tuned effectively at creation time, the
+`repartition `__
+method can be used as a last resort.
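+
+A sketch of tuning the partition size at creation time (the path and the
+``"1 GiB"`` value are placeholders; choose a blocksize based on your GPU's
+memory capacity)::
+
+    import dask
+    import dask.dataframe as dd
+
+    dask.config.set({"dataframe.backend": "cudf"})
+    ddf = dd.read_parquet("/path/to/dataset/", blocksize="1 GiB")
+
+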
+Use Parquet
+~~~~~~~~~~~
+
+`Parquet `__ is the recommended
+file format for Dask cuDF. It provides efficient columnar storage and enables
+Dask to perform valuable query optimizations like column projection and
+predicate pushdown.
+
+The most important arguments to :func:`read_parquet` are ``blocksize`` and
+``aggregate_files``:
+
+``blocksize``: Use this argument to specify the maximum partition size.
+The default is ``"256 MiB"``, but larger values are usually more performant
+on GPUs with more than 8 GiB of memory. Dask will use the ``blocksize``
+value to map a discrete number of Parquet row-groups (or files) to each
+output partition. This mapping will only account for the uncompressed
+storage size of each row group, which is usually smaller than the
+corresponding ``cudf.DataFrame``.
+
+``aggregate_files``: Use this argument to specify whether Dask should
+map multiple files to the same DataFrame partition. The default is
+``False``, but ``aggregate_files=True`` is usually more performant when
+the dataset contains many files that are smaller than half of ``blocksize``.
+
+If you know that your files correspond to a reasonable partition size
+before splitting or aggregation, set ``blocksize=None`` to disallow
+file splitting. In the absence of column-projection pushdown, this will
+result in a simple 1-to-1 mapping between files and output partitions.
+
+.. note::
+    If your workflow requires a strict 1-to-1 mapping between files and
+    partitions, use :func:`from_map` to manually construct your partitions
+    with ``cudf.read_parquet``. When :func:`dd.read_parquet` is used,
+    query-planning optimizations may automatically aggregate distinct files
+    into the same partition (even when ``aggregate_files=False``).
+
+.. note::
+    Metadata collection can be extremely slow when reading from remote
+    storage (e.g. S3 and GCS). When reading many remote files that all
+    correspond to a reasonable partition size, use ``blocksize=None``
+    to avoid unnecessary metadata collection.
+
+
+Use :func:`from_map`
+~~~~~~~~~~~~~~~~~~~~
+
+To implement custom DataFrame-creation logic that is not covered by
+existing APIs (like :func:`read_parquet`), use :func:`dask.dataframe.from_map`
+whenever possible. The :func:`from_map` API has several advantages
+over :func:`from_delayed`:
+
+* It allows proper lazy execution of your custom logic
+* It enables column projection (as long as the mapped function supports a ``columns`` keyword argument)
+
+See the `from_map API documentation `__
+for more details, and the sketch below for typical usage.
+
+.. note::
+    Whenever possible, be sure to specify the ``meta`` argument to
+    :func:`from_map`. If this argument is excluded, Dask will need to
+    materialize the first partition eagerly. If a large RMM pool is in
+    use on the first visible device, this eager execution on the client
+    may lead to an OOM error.
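+
+A sketch of :func:`from_map` usage (the file list, the ``read_partition``
+helper, and the column names are hypothetical)::
+
+    import cudf
+    import dask.dataframe as dd
+
+    files = ["part.0.parquet", "part.1.parquet"]
+
+    def read_partition(path, columns=None):
+        # Accepting a `columns` argument enables column projection.
+        return cudf.read_parquet(path, columns=columns)
+
+    # An empty cudf.DataFrame describing the expected output schema.
+    meta = cudf.DataFrame({"a": [0], "b": [0.0]}).head(0)
+    ddf = dd.from_map(read_partition, files, meta=meta)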
+
+
+Sorting, Joining, and Grouping
+------------------------------
+
+Sorting, joining, and grouping operations all have the potential to
+require the global shuffling of data between distinct partitions.
+When the initial data fits comfortably in global GPU memory, these
+"all-to-all" operations are typically bound by worker-to-worker
+communication. When the data is larger than global GPU memory, the
+bottleneck is typically device-to-host memory spilling.
+
+Although every workflow is different, the following guidelines
+are often recommended:
+
+* `Use a distributed cluster with Dask-CUDA workers `_
+* `Use native cuDF spilling whenever possible `_
+* Avoid shuffling whenever possible
+
+  * Use ``split_out=1`` for low-cardinality groupby aggregations
+  * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``)
+
+* `Use UCX `__ if communication is a bottleneck.
+
+.. note::
+    UCX enables Dask-CUDA workers to communicate using high-performance
+    transport technologies like `NVLink `__
+    and InfiniBand. Without UCX, inter-process communication will rely
+    on TCP sockets.
+
+
+User-defined functions
+----------------------
+
+Most real-world Dask DataFrame workflows use `map_partitions
+`__
+to map user-defined functions across every partition of the underlying data.
+This API is an intuitive and scalable way to apply custom operations.
+With that said, the :func:`map_partitions` method will produce an opaque
+DataFrame expression that blocks the query-planning `optimizer
+`__ from performing
+useful optimizations (like projection and filter pushdown).
+
+Since column-projection pushdown is often the most effective optimization,
+it is important to select the necessary columns both before and after calling
+:func:`map_partitions`. You can also add explicit filter operations to further
+mitigate the loss of filter pushdown.
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 7fe6cbd45fa..23ca7e49753 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -15,7 +15,7 @@ as the ``"cudf"`` dataframe backend for
 .. note::
   Neither Dask cuDF nor Dask DataFrame provides support for multi-GPU
   or multi-node execution on their own. You must also deploy a
-  `dask.distributed ` cluster
+  `dask.distributed `__ cluster
   to leverage multiple GPUs. We strongly recommend using `Dask-CUDA
   `__ to simplify the setup of the cluster,
   taking advantage of all features of the GPU
@@ -29,6 +29,10 @@ minutes to Dask by
 `10 minutes to cuDF and Dask cuDF
 `__.
 
+After reviewing the sections below, please see the
+:ref:`Best Practices ` page for further guidance on
+using Dask cuDF effectively.
+
 Using Dask cuDF
 ---------------
@@ -36,7 +40,7 @@ Using Dask cuDF
 The Dask DataFrame API (Recommended)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Simply use the `Dask configuration ` system to
+Simply use the `Dask configuration `__ system to
 set the ``"dataframe.backend"`` option to ``"cudf"``.
 From Python, this can be achieved like so::
@@ -50,14 +54,14 @@ environment before running your code.
 
 Once this is done, the public Dask DataFrame API will leverage
 ``cudf`` automatically when a new DataFrame collection is created
 from an on-disk format using any of the following ``dask.dataframe``
-functions::
+functions:
 
-* :func:`dask.dataframe.read_parquet`
-* :func:`dask.dataframe.read_json`
-* :func:`dask.dataframe.read_csv`
-* :func:`dask.dataframe.read_orc`
-* :func:`dask.dataframe.read_hdf`
-* :func:`dask.dataframe.from_dict`
+* :func:`read_parquet`
+* :func:`read_json`
+* :func:`read_csv`
+* :func:`read_orc`
+* :func:`read_hdf`
+* :func:`from_dict`
 
 For example::
@@ -112,8 +116,8 @@ performance benefit over the CPU/GPU-portable ``dask.dataframe`` API.
 
 Also, using some parts of the explicit API is incompatible with
 automatic query planning (see the next section).
-The explicit Dask cuDF API
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+Query Planning
+~~~~~~~~~~~~~~
 
 Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+).
 As long as the ``"dataframe.query-planning"`` configuration is set to
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index c8308ca17ec..2bb74c3e3b1 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -38,6 +38,8 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean allowLeadingZeros;
   private final boolean allowNonNumericNumbers;
   private final boolean allowUnquotedControlChars;
+  private final boolean cudfPruneSchema;
+  private final byte lineDelimiter;
 
   private JSONOptions(Builder builder) {
     super(builder);
@@ -52,6 +54,16 @@ private JSONOptions(Builder builder) {
     allowLeadingZeros = builder.allowLeadingZeros;
     allowNonNumericNumbers = builder.allowNonNumericNumbers;
     allowUnquotedControlChars = builder.allowUnquotedControlChars;
+    cudfPruneSchema = builder.cudfPruneSchema;
+    lineDelimiter = builder.lineDelimiter;
+  }
+
+  public boolean shouldCudfPruneSchema() {
+    return cudfPruneSchema;
+  }
+
+  public byte getLineDelimiter() {
+    return lineDelimiter;
   }
 
   public boolean isDayFirst() {
@@ -123,6 +135,22 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptions.Builder> {
+    private boolean cudfPruneSchema = false;
+    private byte lineDelimiter = '\n';
+
+    public Builder withCudfPruneSchema(boolean prune) {
+      cudfPruneSchema = prune;
+      return this;
+    }
+
+    public Builder withLineDelimiter(char delimiter) {
+      if (delimiter > Byte.MAX_VALUE) {
+        throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters: " + delimiter);
+      }
+      lineDelimiter = (byte)delimiter;
+      return this;
+    }
+
     /**
      * Should json validation be strict or not
      */
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 09da43374ae..6d370ca27b2 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -258,7 +258,9 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                       boolean strictValidation,
                                       boolean allowLeadingZeros,
                                       boolean allowNonNumericNumbers,
-                                      boolean allowUnquotedControl) throws CudfException;
+                                      boolean allowUnquotedControl,
+                                      boolean pruneColumns,
+                                      byte lineDelimiter) throws CudfException;
 
   private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
                                                     int[] dTypeIds, int[] dTypeScales,
@@ -272,6 +274,8 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
                                                     boolean allowLeadingZeros,
                                                     boolean allowNonNumericNumbers,
                                                     boolean allowUnquotedControl,
+                                                    boolean pruneColumns,
+                                                    byte lineDelimiter,
                                                     long dsHandle) throws CudfException;
 
   private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
@@ -284,6 +288,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
                                                             boolean allowLeadingZeros,
                                                             boolean allowNonNumericNumbers,
                                                             boolean allowUnquotedControl,
+                                                            byte lineDelimiter,
                                                             long dsHandle) throws CudfException;
 
   private static native long readAndInferJSON(long address, long length,
@@ -297,7 +302,8 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean strictValidation,
                                               boolean allowLeadingZeros,
                                               boolean allowNonNumericNumbers,
-                                              boolean allowUnquotedControl) throws CudfException;
+                                              boolean allowUnquotedControl,
+                                              byte lineDelimiter) throws CudfException;
 
   /**
    * Read in Parquet formatted data.
@@ -1308,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount)
    * @return the file parsed as a table on the GPU.
*/ public static Table readJSON(Schema schema, JSONOptions opts, File path) { + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1321,7 +1331,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1404,7 +1416,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars())); + opts.unquotedControlChars(), + opts.getLineDelimiter())); } /** @@ -1426,6 +1439,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + opts.getLineDelimiter(), dsHandle)); return twm; } finally { @@ -1465,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1479,7 +1497,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1505,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1518,6 +1542,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter(), dsHandle))) { return gatherJSONColumns(schema, twm, emptyRowCount); } finally { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 92e213bcb60..0f77da54152 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* 
env,
                                                            jboolean allow_leading_zeros,
                                                            jboolean allow_nonnumeric_numbers,
                                                            jboolean allow_unquoted_control,
+                                                           jbyte line_delimiter,
                                                            jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1646,8 +1647,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
-        .keep_quotes(keep_quotes);
+        .keep_quotes(keep_quotes)
+        .prune_columns(false);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
@@ -1676,7 +1679,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
                                            jboolean strict_validation,
                                            jboolean allow_leading_zeros,
                                            jboolean allow_nonnumeric_numbers,
-                                           jboolean allow_unquoted_control)
+                                           jboolean allow_unquoted_control,
+                                           jbyte line_delimiter)
 {
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1700,6 +1704,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .strict_validation(strict_validation)
         .mixed_types_as_string(mixed_types_as_string)
+        .prune_columns(false)
+        .delimiter(static_cast<char>(line_delimiter))
         .keep_quotes(keep_quotes);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
@@ -1814,6 +1820,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
                                                  jboolean allow_leading_zeros,
                                                  jboolean allow_nonnumeric_numbers,
                                                  jboolean allow_unquoted_control,
+                                                 jboolean prune_columns,
+                                                 jbyte line_delimiter,
                                                  jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1848,8 +1856,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
-        .keep_quotes(keep_quotes);
+        .keep_quotes(keep_quotes)
+        .prune_columns(prune_columns);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
@@ -1908,7 +1918,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
                                                            jboolean strict_validation,
                                                            jboolean allow_leading_zeros,
                                                            jboolean allow_nonnumeric_numbers,
-                                                           jboolean allow_unquoted_control)
+                                                           jboolean allow_unquoted_control,
+                                                           jboolean prune_columns,
+                                                           jbyte line_delimiter)
 {
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1957,8 +1969,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
-        .keep_quotes(keep_quotes);
+        .keep_quotes(keep_quotes)
+        .prune_columns(prune_columns);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 830f2b33b32..c7fcb1756b6 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -40,7 +40,6 @@
 import
org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.*; @@ -656,6 +655,24 @@ void testJSONValidationUnquotedControl() { } } + private static final byte[] CR_JSON_TEST_BUFFER = ("{\"a\":\"12\n3\"}\0" + + "{\"a\":\"AB\nC\"}\0").getBytes(StandardCharsets.UTF_8); + + @Test + void testReadJSONDelim() { + Schema schema = Schema.builder().addColumn(DType.STRING, "a").build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withLineDelimiter('\0') + .build(); + try (Table expected = new Table.TestBuilder() + .column("12\n3", "AB\nC") + .build(); + Table found = Table.readJSON(schema, opts, CR_JSON_TEST_BUFFER)) { + assertTablesAreEqual(expected, found); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini index db24415ef9e..187d91996b2 100644 --- a/python/cudf/benchmarks/pytest.ini +++ b/python/cudf/benchmarks/pytest.ini @@ -6,3 +6,4 @@ python_classes = Bench python_functions = bench_* markers = pandas_incompatible: mark a benchmark that cannot be run with pandas +addopts = --tb=native diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e27c595edda..99e4c21df8a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -599,7 +599,6 @@ cdef class Column: children=tuple(children) ) - # TODO: Actually support exposed data pointers. @staticmethod def from_pylibcudf( col, bint data_ptr_exposed=False @@ -616,7 +615,7 @@ cdef class Column: col : pylibcudf.Column The object to copy. data_ptr_exposed : bool - This parameter is not yet supported + Whether the data buffer is exposed. 
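+            Exposed buffers are assumed to be reachable by third-party
+            code, so optimizations such as spilling and copy-on-write are
+            expected to leave them in place.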
Returns ------- @@ -639,16 +638,18 @@ cdef class Column: dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( - data=as_buffer(col.data().obj) if col.data() is not None else None, + data=as_buffer( + col.data().obj, exposed=data_ptr_exposed + ) if col.data() is not None else None, dtype=dtype, size=col.size(), mask=as_buffer( - col.null_mask().obj + col.null_mask().obj, exposed=data_ptr_exposed ) if col.null_mask() is not None else None, offset=col.offset(), null_count=col.null_count(), children=tuple([ - Column.from_pylibcudf(child) + Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) for child in col.children() ]) ) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index e661059faa3..e6c2d136f0d 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -23,9 +23,9 @@ def concat_columns(object columns): def concat_tables(object tables, bool ignore_index=False): plc_tables = [] for table in tables: - cols = table._data.columns + cols = table._columns if not ignore_index: - cols = table._index._data.columns + cols + cols = table._index._columns + cols plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) return data_from_pylibcudf_table( diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 16182e31c08..49714091f46 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -384,7 +384,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} - for name, col in input_table._data.items(): + for name, col in input_table._column_labels_and_values: if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 058e884e08b..9ad96f610b3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -273,7 +273,7 @@ def read_csv( elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._data.names[index] + col_name = df._column_names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) if names is not None and len(names) and isinstance(names[0], int): diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 483250dd36f..bc5e085ec39 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc + @acquire_spill_lock() def add_months(Column col, Column months): @@ -38,43 +40,9 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - if field == "year": - c_result = move(libcudf_datetime.extract_year(col_view)) - elif field == "month": - c_result = move(libcudf_datetime.extract_month(col_view)) - elif field == "day": - c_result = move(libcudf_datetime.extract_day(col_view)) - elif field == "weekday": - c_result = move(libcudf_datetime.extract_weekday(col_view)) - elif field == "hour": - c_result = move(libcudf_datetime.extract_hour(col_view)) - elif field == "minute": - c_result = move(libcudf_datetime.extract_minute(col_view)) - elif field == 
"second": - c_result = move(libcudf_datetime.extract_second(col_view)) - elif field == "millisecond": - c_result = move( - libcudf_datetime.extract_millisecond_fraction(col_view) - ) - elif field == "microsecond": - c_result = move( - libcudf_datetime.extract_microsecond_fraction(col_view) - ) - elif field == "nanosecond": - c_result = move( - libcudf_datetime.extract_nanosecond_fraction(col_view) - ) - elif field == "day_of_year": - c_result = move(libcudf_datetime.day_of_year(col_view)) - else: - raise ValueError(f"Invalid datetime field: '{field}'") - - result = Column.from_unique_ptr(move(c_result)) + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) + ) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index b1900138d94..564daefbae2 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -179,7 +179,7 @@ cdef update_struct_field_names( ): # Deprecated, remove in favor of add_col_struct_names # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._data.items()): + for i, (name, col) in enumerate(table._column_labels_and_values): table._data[name] = update_column_struct_field_names( col, schema_info[i] ) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index e6c9d60b05b..fa2690c7f21 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -235,16 +235,16 @@ cdef object _process_metadata(object df, df._index = idx elif set(index_col).issubset(names): index_data = df[index_col] - actual_index_names = list(index_col_names.values()) - if len(index_data._data) == 1: + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: idx = cudf.Index._from_column( - index_data._data.columns[0], - name=actual_index_names[0] + index_data._columns[0], + name=next(actual_index_names) ) else: idx = cudf.MultiIndex.from_frame( index_data, - names=actual_index_names + names=list(actual_index_names) ) df.drop(columns=index_col, inplace=True) df._index = idx @@ -252,7 +252,7 @@ cdef object _process_metadata(object df, if use_pandas_metadata: df.index.names = index_col - if len(df._data.names) == 0 and column_index_type is not None: + if df._num_columns == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) return df diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8d463829a19..60a6795a402 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -20,13 +20,7 @@ from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( to_booleans as cpp_to_booleans, ) from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - from_timestamps as cpp_from_timestamps, is_timestamp as cpp_is_timestamp, - to_timestamps as cpp_to_timestamps, -) -from pylibcudf.libcudf.strings.convert.convert_durations cimport ( - from_durations as cpp_from_durations, - to_durations as cpp_to_durations, ) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, @@ -48,8 +42,12 @@ from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id +import pylibcudf as plc + import cudf +from cudf._lib.types cimport dtype_to_pylibcudf_type + def floating_to_string(Column input_col): cdef column_view 
input_column_view = input_col.view() @@ -522,19 +520,14 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") - cdef column_view input_strings_names = names.view() - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_timestamps( - input_column_view, - c_timestamp_format, - input_strings_names)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.from_timestamps( + input_col.to_pylibcudf(mode="read"), + c_timestamp_format, + names.to_pylibcudf(mode="read") + ) + ) def timestamp2int(Column input_col, dtype, format): @@ -551,23 +544,15 @@ def timestamp2int(Column input_col, dtype, format): A Column with string represented in date-time format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.to_timestamps( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_timestamp_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_timestamps( - input_column_view, - out_type, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) def istimestamp(Column input_col, str format): @@ -613,23 +598,15 @@ def timedelta2int(Column input_col, dtype, format): A Column with string represented in TimeDelta format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.to_durations( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_durations( - input_column_view, - out_type, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) def int2timedelta(Column input_col, str format): @@ -647,16 +624,13 @@ def int2timedelta(Column input_col, str format): """ - cdef column_view input_column_view = input_col.view() cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_durations( - input_column_view, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.from_durations( + input_col.to_pylibcudf(mode="read"), + c_duration_format + ) + ) def int2ip(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 63f4d57e562..5bf336f4f3c 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,21 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags): The returning data contains one row for each subject string, and one column for each group. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_extract( - source_view, - dereference(c_prog) - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) + plc_result = plc.strings.extract.extract( + source_strings.to_pylibcudf(mode="read"), prog ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 3cf2084e30a..0e758d5b322 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,21 +1,13 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def findall(Column source_strings, object pattern, uint32_t flags): @@ -23,18 +15,11 @@ def findall(Column source_strings, object pattern, uint32_t flags): Returns data with all non-overlapping matches of `pattern` in each string of `source_strings` as a lists column. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_findall( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.findall( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index acf52cb7b9f..38ecb21a94c 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() @@ -25,23 +26,14 @@ def strip(Column source_strings, """ cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + return Column.from_pylibcudf( + plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.SideType.BOTH, + repl.c_value + ) ) - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.BOTH, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def lstrip(Column source_strings, diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index baa08a545ec..40d0c9eac3a 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -3,41 +3,26 @@ from numba.np import numpy_support import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils from cython.operator cimport dereference -from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string from libcpp.utility cimport move cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform from pylibcudf.expressions cimport Expression from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport ( - bitmask_type, - data_type, - size_type, - type_id, -) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() @@ -46,17 +31,8 @@ def bools_to_mask(Column col): Given an int8 (boolean) column, compress the data from booleans to bits and return a Buffer """ - cdef column_view col_view = col.view() - cdef 
pair[unique_ptr[device_buffer], size_type] cpp_out - cdef unique_ptr[device_buffer] up_db - - with nogil: - cpp_out = move(libcudf_transform.bools_to_mask(col_view)) - up_db = move(cpp_out.first) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(rmm_db) - return buf + mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) + return as_buffer(mask) @acquire_spill_lock() @@ -68,22 +44,15 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.Buffer") - cdef bitmask_type* bit_mask = ( - mask_buffer.get_ptr(mode="read") + plc_column = plc_transform.mask_to_bools( + mask_buffer.get_ptr(mode="read"), begin_bit, end_bit ) - - cdef unique_ptr[column] result - with nogil: - result = move( - libcudf_transform.mask_to_bools(bit_mask, begin_bit, end_bit) - ) - - return Column.from_unique_ptr(move(result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def nans_to_nulls(Column input): - (mask, _) = plc_transform.nans_to_nulls( + mask, _ = plc_transform.nans_to_nulls( input.to_pylibcudf(mode="read") ) return as_buffer(mask) @@ -91,80 +60,45 @@ def nans_to_nulls(Column input): @acquire_spill_lock() def transform(Column input, op): - cdef column_view c_input = input.view() - cdef string c_str - cdef type_id c_tid - cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) - c_str = compiled_op[0].encode('UTF-8') np_dtype = cudf.dtype(compiled_op[1]) - try: - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - np_dtype - ] - ) - c_dtype = data_type(c_tid) - - except KeyError: - raise TypeError( - "Result of window function has unsupported dtype {}" - .format(np_dtype) - ) - - with nogil: - c_output = move(libcudf_transform.transform( - c_input, - c_str, - c_dtype, - True - )) - - return Column.from_unique_ptr(move(c_output)) + plc_column = plc_transform.transform( + input.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True + ) + return Column.from_pylibcudf(plc_column) def table_encode(list source_columns): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef pair[unique_ptr[table], unique_ptr[column]] c_result - - with nogil: - c_result = move(libcudf_transform.encode(c_input)) + plc_table, plc_column = plc_transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + ) return ( - columns_from_unique_ptr(move(c_result.first)), - Column.from_unique_ptr(move(c_result.second)) + [Column.from_pylibcudf(col) for col in plc_table.columns()], + Column.from_pylibcudf(plc_column) ) def one_hot_encode(Column input_column, Column categories): - cdef column_view c_view_input = input_column.view() - cdef column_view c_view_categories = categories.view() - cdef pair[unique_ptr[column], table_view] c_result - - with nogil: - c_result = move( - libcudf_transform.one_hot_encode(c_view_input, c_view_categories) - ) - - # Notice, the data pointer of `owner` has been exposed - # through `c_result.second` at this point. 
- owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - - pylist_categories = categories.to_arrow().to_pylist() - encodings, _ = data_from_table_view( - move(c_result.second), - owner=owner, - column_names=[ - x if x is not None else '' for x in pylist_categories - ] + plc_table = plc_transform.one_hot_encode( + input_column.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), ) - return encodings + result_columns = [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ] + result_labels = [ + x if x is not None else '' + for x in categories.to_arrow().to_pylist() + ] + return dict(zip(result_labels, result_columns)) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cae28d02ef4..8660cca9322 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*: If True, don't include the index in the columns. """ return table_view_from_columns( - tbl._index._data.columns + tbl._data.columns + tbl._index._columns + tbl._columns if not ignore_index and tbl._index is not None - else tbl._data.columns + else tbl._columns ) @@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index): index_descriptors = [] columns_to_convert = list(table._columns) # Columns - for name, col in table._data.items(): + for name, col in table._column_labels_and_values: if cudf.get_option("mode.pandas_compatible"): # in pandas-compat mode, non-string column names are stringified. col_names.append(str(name)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ff114474aa4..a6abd63d042 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1951,7 +1951,7 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._data)), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e059917b0b8..4463e3280df 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -623,11 +623,9 @@ def extract( "unsupported value for `flags` parameter" ) - data, _ = libstrings.extract(self._column, pat, flags) + data = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: - data = next(iter(data.values())) - else: - data = data + _, data = data.popitem() return self._return_or_inplace(data, expand=expand) def contains( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 09b0f453692..bc093fdaa9a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self._data) + old_ncols = len(self) del self._data[key] - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def __len__(self) -> int: @@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]: @property def nlevels(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 if not self.multiindex: return 1 @@ -226,7 +226,7 @@ def name(self) -> abc.Hashable: @cached_property def 
nrows(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 else: return len(next(iter(self.values()))) @@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: Parameters ---------- old_ncols: int - len(self._data) before self._data was modified + len(self) before self._data was modified new_ncols: int - len(self._data) after self._data was modified + len(self) after self._data was modified """ cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -335,7 +335,7 @@ def insert( if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - old_ncols = len(self._data) + old_ncols = len(self) if loc == -1: loc = old_ncols elif not (0 <= loc <= old_ncols): @@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple: tuple """ if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) + start, stop, step = index.indices(len(self)) return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) @@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: if len(self) > 0 and len(value) != self.nrows: raise ValueError("All columns must be of equal length") - old_ncols = len(self._data) + old_ncols = len(self) self._data[key] = value - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def _select_by_label_list_like(self, key: tuple) -> Self: @@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None: if level < 0: level += self.nlevels - old_ncols = len(self._data) + old_ncols = len(self) self._data = { _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } - new_ncols = len(self._data) + new_ncols = len(self) self._level_names = ( self._level_names[:level] + self._level_names[level + 1 :] ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d73ad8225ca..16b0aa95c35 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg): return False @_performance_tracking - def _downcast_to_series(self, df, arg): + def _downcast_to_series(self, df: DataFrame, arg): """ "Downcast" from a DataFrame to a Series based on Pandas indexing rules @@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg): # take series along the axis: if axis == 1: - return df[df._data.names[0]] + return df[df._column_names[0]] else: if df._num_columns > 0: dtypes = df.dtypes.values.tolist() normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: df[name] = col.astype(normalized_dtype) sr = df.T - return sr[sr._data.names[0]] + return sr[sr._column_names[0]] class _DataFrameLocIndexer(_DataFrameIndexer): @@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg): and len(arg) > 1 and is_scalar(arg[1]) ): - return result._data.columns[0].element_indexing(0) + return result._columns[0].element_indexing(0) return result else: if isinstance(arg[0], slice): @@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg): else: tmp_col_name = str(uuid4()) cantor_name = "_" + "_".join( - map(str, columns_df._data.names) + map(str, columns_df._column_names) ) if columns_df._data.multiindex: # column names must be appropriate length tuples @@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value): else column.column_empty_like( col, masked=True, newsize=length ) - 
for key, col in self._data.items() + for key, col in self._column_labels_and_values ) self._data = self._data._from_columns_like_self( new_columns, verify=False @@ -1494,8 +1494,8 @@ def __delitem__(self, name): @_performance_tracking def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._data.columns] - names = [str(name) for name in self._data.names] + mem_usage = [col.memory_usage for col in self._columns] + names = [str(name) for name in self._column_names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") @@ -1725,7 +1725,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not empty_has_index) - else list(f.index._data.columns) + else list(f.index._columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1808,7 +1808,7 @@ def _concat( out.index.dtype, cudf.CategoricalDtype ): out = out.set_index(out.index) - for name, col in out._data.items(): + for name, col in out._column_labels_and_values: out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype ) @@ -1831,13 +1831,13 @@ def astype( errors: Literal["raise", "ignore"] = "raise", ): if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._data.names)) > 0: + if len(set(dtype.keys()) - set(self._column_names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." ) else: - dtype = {cc: dtype for cc in self._data.names} + dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): @@ -2601,7 +2601,7 @@ def equals(self, other) -> bool: # If all other checks matched, validate names. if ret: for self_name, other_name in zip( - self._data.names, other._data.names + self._column_names, other._column_names ): if self_name != other_name: ret = False @@ -2676,7 +2676,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._data.columns)), + data=dict(zip(pd_columns, self._columns)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._data.columns)), + data=dict(zip(other.names, self._columns)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -2983,7 +2983,7 @@ def set_index( elif isinstance(col, (MultiIndex, pd.MultiIndex)): if isinstance(col, pd.MultiIndex): col = MultiIndex.from_pandas(col) - data_to_add.extend(col._data.columns) + data_to_add.extend(col._columns) names.extend(col.names) elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) @@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) out = [] - for (name, col), other_col in zip(self._data.items(), other_cols): + for (name, col), other_col in zip( + self._column_labels_and_values, other_cols + ): source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, @@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): column.column_empty_like( col_data, masked=True, newsize=length ) - for col_data in self._data.values() + for col_data in self._columns ), verify=False, ) @@ -3664,7 +3666,7 @@ def rename( name: col.find_and_replace( to_replace, vals, is_all_na ) - for name, col in self.index._data.items() + for 
name, col in self.index._column_labels_and_values } ) except OverflowError: @@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in self._column_names] return out @_performance_tracking @@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in self._column_names] return out @_performance_tracking @@ -4805,7 +4803,7 @@ def _func(x): # pragma: no cover # TODO: naive implementation # this could be written as a single kernel result = {} - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column @@ -5444,7 +5442,7 @@ def to_pandas( out_index = self.index.to_pandas() out_data = { i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._data.columns) + for i, col in enumerate(self._columns) } out_df = pd.DataFrame(out_data, index=out_index) @@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table: index = index._as_int_index() index.name = "__index_level_0__" if isinstance(index, MultiIndex): - index_descr = list(index._data.names) + index_descr = index._column_names index_levels = index.levels else: index_descr = ( index.names if index.name is not None else ("index",) ) data = data.copy(deep=False) - for gen_name, col_name in zip(index_descr, index._data.names): + for gen_name, col_name in zip( + index_descr, index._column_names + ): data._insert( data.shape[1], gen_name, @@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._data.names], + columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, index_levels=index_levels, @@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): "column_dtypes is currently not supported." ) members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members += list(self._dtypes) dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_numpy() - for col in self._data.names: + for col in self._column_names: ret[col] = self[col].to_numpy() return ret @@ -6059,7 +6059,7 @@ def quantile( ) if columns is None: - columns = data_df._data.names + columns = set(data_df._column_names) if isinstance(q, numbers.Number): q_is_number = True @@ -6084,7 +6084,7 @@ def quantile( # Ensure that qs is non-scalar so that we always get a column back. 
interpolation = interpolation or "linear" result = {} - for k in data_df._data.names: + for k in data_df._column_names: if k in columns: ser = data_df[k] res = ser.quantile( @@ -6198,7 +6198,7 @@ def make_false_column_like_self(): if isinstance(values, DataFrame) else {name: values._column for name in self._data} ) - for col, self_col in self._data.items(): + for col, self_col in self._column_labels_and_values: if col in other_cols: other_col = other_cols[col] self_is_cat = isinstance(self_col, CategoricalColumn) @@ -6231,13 +6231,13 @@ def make_false_column_like_self(): else: result[col] = make_false_column_like_self() elif is_dict_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: if name in values: result[name] = col.isin(values[name]) else: result[name] = make_false_column_like_self() elif is_list_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: result[name] = col.isin(values) else: raise TypeError( @@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable else as_column(True, length=len(filtered._data[name])) - for name in filtered._data.names + for name in filtered._column_names } ) mask = mask.all(axis=1) @@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._data.names), + index=cudf.Index(self._column_names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6409,7 +6409,7 @@ def _reduce( return source._apply_cupy_method_axis_1(op, **kwargs) else: axis_0_results = [] - for col_label, col in source._data.items(): + for col_label, col in source._column_labels_and_values: try: axis_0_results.append(getattr(col, op)(**kwargs)) except AttributeError as err: @@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, numeric_only ) - for col in prepared._data.names: + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( prepared._data[col] @@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._data.items(): + for k, col in self._column_labels_and_values: infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: df._insert(len(df._data), k, col) @@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels + data=range(self._num_columns), index=named_levels ) column_indices: list[list[int]] = [] @@ -7392,17 +7392,17 @@ def to_struct(self, name=None): ----- Note: a copy of the columns is made. """ - if not all(isinstance(name, str) for name in self._data.names): + if not all(isinstance(name, str) for name in self._column_names): warnings.warn( "DataFrame contains non-string column name(s). Struct column " "requires field name to be string. Non-string column names " "will be casted to string as the field name." 
) - fields = {str(name): col.dtype for name, col in self._data.items()} + fields = {str(name): dtype for name, dtype in self._dtypes} col = StructColumn( data=None, dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._data.columns), + children=tuple(col.copy(deep=True) for col in self._columns), size=len(self), offset=0, ) @@ -7984,7 +7984,7 @@ def value_counts( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - columns = list(self._data.names) if subset is None else subset + columns = list(self._column_names) if subset is None else subset result = ( self.groupby( by=columns, @@ -8105,7 +8105,7 @@ def func(left, right, output): right._column_names ) elif _is_scalar_or_zero_d_array(right): - for name, col in output._data.items(): + for name, col in output._column_labels_and_values: output._data[name] = col.fillna(value) return output else: @@ -8387,7 +8387,7 @@ def extract_col(df, col): and col not in df.index._data and not isinstance(df.index, MultiIndex) ): - return df.index._data.columns[0] + return df.index._column return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b2bc85b13b..37ad6b8fabb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self) -> abc.Iterable: - return zip(self._data.names, (col.dtype for col in self._data.columns)) + def _column_labels_and_values( + self, + ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: + return zip(self._column_names, self._columns) + + @property + def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: + for label, col in self._column_labels_and_values: + yield label, col.dtype @property def ndim(self) -> int: @@ -87,7 +94,7 @@ def serialize(self): # TODO: See if self._data can be serialized outright header = { "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(tuple(self._data.names)), + "column_names": pickle.dumps(self._column_names), "column_rangeindex": pickle.dumps(self._data.rangeindex), "column_multiindex": pickle.dumps(self._data.multiindex), "column_label_dtype": pickle.dumps(self._data.label_dtype), @@ -156,7 +163,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: if inplace: - for col in self._data: + for col in self._column_names: if col in result._data: self._data[col]._mimic_inplace( result._data[col], inplace=True @@ -267,7 +274,7 @@ def __len__(self) -> int: def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._data.items() + for col_name, col in self._column_labels_and_values ) ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) @@ -338,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._columns, other._columns) ) @_performance_tracking @@ -434,11 +439,9 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(iter(self._data.values())).dtype + dtype = next(self._dtypes)[1] else: - dtype = find_common_type( - [col.dtype for col in self._data.values()] - ) + dtype = find_common_type([dtype for _, dtype in self._dtypes]) if not 
isinstance(dtype, numpy.dtype): raise NotImplementedError( @@ -446,12 +449,12 @@ def to_array( ) if self.ndim == 1: - return to_array(self._data.columns[0], dtype) + return to_array(self._columns[0], dtype) else: matrix = module.empty( shape=(len(self), ncol), dtype=dtype, order="F" ) - for i, col in enumerate(self._data.values()): + for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. @@ -751,7 +754,7 @@ def fillna( filled_columns = [ col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._data.items() + for name, col in self._column_labels_and_values ] return self._mimic_inplace( @@ -764,11 +767,15 @@ def fillna( ) @_performance_tracking - def _drop_column(self, name): - """Drop a column by *name*""" - if name not in self._data: - raise KeyError(f"column '{name}' does not exist") - del self._data[name] + def _drop_column( + self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" + ) -> None: + """Drop a column by *name* inplace.""" + try: + del self._data[name] + except KeyError as err: + if errors != "ignore": + raise KeyError(f"column '{name}' does not exist") from err @_performance_tracking def _quantile_table( @@ -988,7 +995,10 @@ def to_arrow(self): index: [[1,2,3]] """ return pa.Table.from_pydict( - {str(name): col.to_arrow() for name, col in self._data.items()} + { + str(name): col.to_arrow() + for name, col in self._column_labels_and_values + } ) @_performance_tracking @@ -1012,7 +1022,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. """ - for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): + for (name, col), (_, dtype) in zip( + self._column_labels_and_values, other._dtypes + ): self._data.set_by_label(name, col._with_type_metadata(dtype)) return self @@ -1422,7 +1434,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ + libcudf.copying.columns_split(list(self._columns), splits)[ split_idx ], self._column_names, @@ -1432,7 +1444,7 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode([*self._columns]) + columns, indices = libcudf.transform.table_encode(list(self._columns)) keys = self._from_columns_like_self(columns) return keys, indices @@ -1578,7 +1590,7 @@ def __neg__(self): col.unary_operator("not") if col.dtype.kind == "b" else -1 * col - for col in self._data.columns + for col in self._columns ) ) ) @@ -1840,9 +1852,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - self._data._from_columns_like_self( - (~col for col in self._data.columns) - ) + self._data._from_columns_like_self((~col for col in self._columns)) ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6424c8af877..cb8cd0cd28b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. 
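For orientation, the `frame.py` hunks above define the accessors that the call-site changes throughout this diff migrate to. A rough sketch of the intended equivalences, assuming `self._data` behaves like a label-to-column mapping (as `ColumnAccessor` does); this is an illustration, not the real implementation:

    class FrameSketch:
        """Simplified stand-in for cudf.core.frame.Frame."""

        def __init__(self, data: dict):
            self._data = data  # label -> column-like object

        @property
        def _column_names(self) -> tuple:      # was: self._data.names
            return tuple(self._data)

        @property
        def _columns(self) -> tuple:           # was: self._data.columns
            return tuple(self._data.values())

        @property
        def _column_labels_and_values(self):   # was: self._data.items()
            return zip(self._column_names, self._columns)

        @property
        def _dtypes(self):                     # lazy (label, dtype) pairs
            for label, col in self._column_labels_and_values:
                yield label, col.dtype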
- left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) - right_cols = list(result_index._data.columns) + left_cols = list(self.grouping.keys.drop_duplicates()._columns) + right_cols = list(result_index._columns) join_keys = [ _match_join_keys(lcol, rcol, "left") for lcol, rcol in zip(left_cols, right_cols) @@ -1483,7 +1481,7 @@ def _post_process_chunk_results( # the column name should be, especially if we applied # a nameless UDF. result = result.to_frame( - name=grouped_values._data.names[0] + name=grouped_values._column_names[0] ) else: index_data = group_keys._data.copy(deep=True) @@ -1632,7 +1630,7 @@ def mult(df): if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. - for name, col in res._data.items(): + for name, col in res._column_labels_and_values: if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def _reduce_numeric_only(self, op: str): columns = list( name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names - ) + for name, dtype in self.obj._dtypes + if (is_numeric_dtype(dtype) and name not in self.grouping.names) ) return self[columns].agg(op) @@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame: """ # If the key columns are in `obj`, filter them out value_column_names = [ - x for x in self._obj._data.names if x not in self._named_columns + x for x in self._obj._column_names if x not in self._named_columns ] value_columns = self._obj._data.select_by_label(value_column_names) return self._obj.__class__._from_data(value_columns) @@ -3224,8 +3219,8 @@ def _handle_series(self, by): self.names.append(by.name) def _handle_index(self, by): - self._key_columns.extend(by._data.columns) - self.names.extend(by._data.names) + self._key_columns.extend(by._columns) + self.names.extend(by._column_names) def _handle_mapping(self, by): by = cudf.Series(by.values(), index=by.keys()) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b2bd20c4982..cd07c58c5d9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -122,13 +122,13 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="right", ascending=sort_vals.is_monotonic_increasing, @@ -286,6 +286,20 @@ def name(self): def name(self, value): self._name = value + @property + @_performance_tracking + def _column_names(self) -> tuple[Any]: + return (self.name,) + + @property + @_performance_tracking + def _columns(self) -> tuple[ColumnBase]: + return (self._values,) + + @property + def _column_labels_and_values(self) -> Iterable: + return zip(self._column_names, self._columns) + @property # type: ignore @_performance_tracking def start(self) -> int: @@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } data = self._apply_cupy_ufunc_to_operands( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fd6bf37f0e6..5952815deef 100644 --- 
a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,7 +3,6 @@ from __future__ import annotations -import numbers import operator import textwrap import warnings @@ -150,24 +149,14 @@ ) -def _get_host_unique(array): +def _get_unique_drop_labels(array): + """Return labels to be dropped for IndexedFrame.drop.""" if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] + yield from np.unique(as_column(array).values_host) + elif is_scalar(array): + yield array else: - return set(array) - - -def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): - for c in columns: - try: - f._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e + yield from set(array) def _indices_from_labels(obj, labels): @@ -294,7 +283,7 @@ def _num_rows(self) -> int: @property def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? - return self.index._data.names + return self.index._column_names @classmethod def _from_data( @@ -307,6 +296,7 @@ def _from_data( raise ValueError( f"index must be None or a cudf.Index not {type(index).__name__}" ) + # out._num_rows requires .index to be defined out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -882,7 +872,7 @@ def replace( columns_dtype_map=dict(self._dtypes), ) copy_data = [] - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: try: replaced = col.find_and_replace( to_replace_per_column[name], @@ -2703,11 +2693,11 @@ def sort_index( by.extend( filter( lambda n: n not in handled, - self.index._data.names, + self.index._column_names, ) ) else: - by = list(idx._data.names) + by = list(idx._column_names) inds = idx._get_sorted_inds( by=by, ascending=ascending, na_position=na_position @@ -3013,7 +3003,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self.index._data.columns + self.index._columns if keep_index and not has_range_index else [] ), @@ -3210,7 +3200,7 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self.index._data.columns if keep_index else ()), + *(self.index._columns if keep_index else ()), *self._columns, ] ), @@ -3227,7 +3217,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self.index._data.columns if keep_index else []), + *(self.index._columns if keep_index else []), *self._columns, ], splits, @@ -3763,8 +3753,8 @@ def _reindex( idx_dtype_match = (df.index.nlevels == index.nlevels) and all( _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), + (dtype for _, dtype in df.index._dtypes), + (dtype for _, dtype in index._dtypes), ) ) @@ -3783,7 +3773,7 @@ def _reindex( (name or 0) if isinstance(self, cudf.Series) else name: col - for name, col in df._data.items() + for name, col in df._column_labels_and_values }, index=df.index, ) @@ -3794,7 +3784,7 @@ def _reindex( index = index if index is not None else df.index if column_names is None: - names = list(df._data.names) + names = list(df._column_names) level_names = self._data.level_names multiindex = self._data.multiindex rangeindex = self._data.rangeindex @@ -3948,7 +3938,7 @@ def round(self, decimals=0, how="half_even"): col.round(decimals[name], how=how) if name
in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) - for name, col in self._data.items() + for name, col in self._column_labels_and_values ) return self._from_data_like_self( self._data._from_columns_like_self(cols) @@ -4270,7 +4260,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count @@ -4305,7 +4295,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self.index._data.columns, *data_columns], + [*self.index._columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4853,7 +4843,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # This works for Index too inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } index = self.index @@ -4933,7 +4923,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self.index._data.columns, *self._columns], repeats, axis + [*self.index._columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -5261,15 +5251,14 @@ def drop( out = self.copy() if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(out, target, errors) + for label in _get_unique_drop_labels(target): + out._drop_column(label, errors=errors) elif axis in (0, "index"): dropped = _drop_rows_by_labels(out, target, level, errors) if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(dropped, columns, errors) + for label in _get_unique_drop_labels(columns): + dropped._drop_column(label, errors=errors) out._mimic_inplace(dropped, inplace=True) @@ -6224,7 +6213,7 @@ def _preprocess_subset(self, subset): not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) - and subset in self._data.names + and subset in self._column_names ): subset = (subset,) diff = set(subset) - set(self._data) @@ -6306,8 +6295,8 @@ def rank( ) numeric_cols = ( name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) + for name, dtype in self._dtypes + if _is_non_decimal_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b65bc7af832..cfeaca00888 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -140,11 +140,15 @@ def __init__( # right_on. self._using_left_index = bool(left_index) left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on + lhs.index._column_names + if left_index + else left_on + if left_on + else on ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names + rhs.index._column_names if right_index else right_on if right_on @@ -334,18 +338,18 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. 
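Returning to the `drop` hunk above: `_get_unique_drop_labels` yields each label at most once and treats scalars as single labels, so `drop` can simply loop and let `_drop_column(errors=...)` decide whether a missing label raises. A hedged behavioural sketch (the assertion reflects the expected result, not a recorded test run):

    import cudf

    gdf = cudf.DataFrame({"a": [1], "b": [2], "c": [3]})
    # Duplicate labels are dropped only once; Series/Index inputs are
    # also accepted (see test_drop_cudf_obj_columns below).
    out = gdf.drop(columns=["b", "b"])
    assert list(out.columns) == ["a", "c"]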
- common_names = set(left_result._data.names) & set( - right_result._data.names + common_names = set(left_result._column_names) & set( + right_result._column_names ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._data.items() + for name, col in left_result._column_labels_and_values } # The right table follows the same rule as the left table except that # key columns from the right table are removed. - for name, col in right_result._data.items(): + for name, col in right_result._column_labels_and_values: if name in common_names: if name not in self._key_columns_with_same_name: data[f"{name}{self.rsuffix}"] = col @@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. by: list[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result.index._data.columns) + by.extend(result.index._columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b86ad38c944..6de3981ba66 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -233,8 +233,8 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... - self._data = self._data.__class__( - dict(zip(value, self._data.values())), + self._data = type(self._data)( + dict(zip(value, self._columns)), level_names=self._data.level_names, verify=False, ) @@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False): @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" - lookup = cudf.DataFrame() + lookup_dict = {} for i, row in enumerate(row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[i] = cudf.Series(row) - frame = cudf.DataFrame(dict(enumerate(index._data.columns))) + lookup_dict[i] = row + lookup = cudf.DataFrame(lookup_dict) + frame = cudf.DataFrame._from_data( + ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) data_table = cudf.concat( [ frame, cudf.DataFrame._from_data( - {"idx": column.as_column(range(len(frame)))} + ColumnAccessor( + {"idx": column.as_column(range(len(frame)))}, + verify=False, + ) ), ], axis=1, @@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # TODO: Remove this after merge/join # obtain deterministic ordering. 
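To make the suffixing rule in `_merge_results` concrete: key columns sharing a name across both tables are emitted once, while colliding non-key columns get the left/right suffixes. With pandas-style default suffixes:

    import cudf

    left = cudf.DataFrame({"key": [1, 2], "val": [10, 20]})
    right = cudf.DataFrame({"key": [1, 2], "val": [30, 40]})
    merged = left.merge(right, on="key")
    # "key" is a same-named key column, so it appears once; "val"
    # collides without being a key, so both sides are suffixed.
    assert sorted(merged.columns) == ["key", "val_x", "val_y"]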
if cudf.get_option("mode.pandas_compatible"): - lookup_order = "_" + "_".join(map(str, lookup._data.names)) + lookup_order = "_" + "_".join(map(str, lookup._column_names)) lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] @@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_column(index._data.columns[k]), + cudf.Series._from_column(index._columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -800,19 +806,19 @@ def _index_and_downcast(self, result, index, index_key): ) if need_downcast: result = result.T - return result[result._data.names[0]] + return result[result._column_names[0]] if len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column result = cudf.Series._from_data( - {}, name=tuple(col[0] for col in index._data.columns) + {}, name=tuple(col[0] for col in index._columns) ) elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. - *_, last_column = index._data.columns + last_column = index._columns[-1] out_index = cudf.Index._from_column( last_column, name=index.names[-1] ) @@ -894,7 +900,7 @@ def __eq__(self, other): [ self_col.equals(other_col) for self_col, other_col in zip( - self._data.values(), other._data.values() + self._columns, other._columns ) ] ) @@ -1475,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: ('aa', 'b')], ) """ - name_i = self._data.names[i] if isinstance(i, int) else i - name_j = self._data.names[j] if isinstance(j, int) else j + name_i = self._column_names[i] if isinstance(i, int) else i + name_j = self._column_names[j] if isinstance(j, int) else j new_data = {} - for k, v in self._data.items(): + for k, v in self._column_labels_and_values: if k not in (name_i, name_j): new_data[k] = v elif k == name_i: @@ -1916,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._data.columns, self._data.columns) + for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( @@ -2113,7 +2119,7 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + for i, (name, col) in enumerate(zip(self.names, self._columns)): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2154,9 +2160,7 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names for .reset_index""" if levels is None: - for i, (col, name) in enumerate( - zip(self._data.columns, self.names) - ): + for i, (col, name) in enumerate(zip(self._columns, self.names)): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c951db00c9a..401fef67ee6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -410,7 +410,7 @@ def concat( result_columns = None if keys_objs is None: for o in objs: - for name, col in 
o._data.items(): + for name, col in o._column_labels_and_values: if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " @@ -438,7 +438,7 @@ def concat( else: # All levels in the multiindex label must have the same type has_multiple_level_types = ( - len({type(name) for o in objs for name in o._data.keys()}) > 1 + len({type(name) for o in objs for name in o._column_names}) > 1 ) if has_multiple_level_types: raise NotImplementedError( @@ -447,7 +447,7 @@ def concat( "the labels to the same type." ) for k, o in zip(keys_objs, objs): - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated @@ -843,7 +843,7 @@ def get_dummies( else: result_data = { col_name: col - for col_name, col in data._data.items() + for col_name, col in data._column_labels_and_values if col_name not in columns } @@ -943,7 +943,7 @@ def _merge_sorted( columns = [ [ - *(obj.index._data.columns if not ignore_index else ()), + *(obj.index._columns if not ignore_index else ()), *obj._columns, ] for obj in objs @@ -985,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._data.items(): + for col_label, col in df._column_labels_and_values: names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1009,7 +1009,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._data.names, + level_names=(None,) + columns._column_names, verify=False, ) return cudf.DataFrame._from_data( @@ -1087,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): # Create a DataFrame composed of columns from both # columns and index ca = ColumnAccessor( - dict( - enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - ), + dict(enumerate(itertools.chain(index._columns, columns._columns))), verify=False, ) columns_index = cudf.DataFrame._from_data(ca) @@ -1560,7 +1556,7 @@ def pivot_table( if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] table_columns = tuple( - map(lambda column: column[1:], table._data.names) + map(lambda column: column[1:], table._column_names) ) table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 7197560b5a4..68f34fa28ff 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -186,7 +186,7 @@ def to_datetime( if isinstance(arg, cudf.DataFrame): # we require at least Ymd required = ["year", "month", "day"] - req = list(set(required) - set(arg._data.names)) + req = list(set(required) - set(arg._column_names)) if len(req): err_req = ",".join(req) raise ValueError( @@ -196,7 +196,7 @@ def to_datetime( ) # replace passed column name with values in _unit_map - got_units = {k: get_units(k) for k in arg._data.names} + got_units = {k: get_units(k) for k in arg._column_names} unit_rev = {v: k for k, v in got_units.items()} # keys we don't recognize diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 265b87350ae..3af662b62ea 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,7 
+210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - if any(col.has_nulls() for col in frame._data.values()): + if any(col.has_nulls() for col in frame._columns): return False np_field_types = np.dtype( list( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 6d7362952c9..bfe716f0afc 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - if str(col.dtype) in supported_types - else np.dtype("O") - for colname, col in frame._data.items() + colname: dtype if str(dtype) in supported_types else np.dtype("O") + for colname, dtype in frame._dtypes } def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - for colname, col in frame._data.items() - if str(col.dtype) in supported_types + colname: dtype + for colname, dtype in frame._dtypes + if str(dtype) in supported_types } def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col - for colname, col in frame._data.items() + for colname, col in frame._column_labels_and_values if str(col.dtype) in supported_types } @@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): *cudautils.make_cache_key( func, tuple(_all_dtypes_from_frame(frame).values()) ), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), + *(col.mask is None for col in frame._columns), + *frame._column_names, scalar_argtypes, suffix, ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a9c20150930..3dc8915bfd1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -186,13 +186,13 @@ def to_csv( "Dataframe doesn't have the labels provided in columns" ) - for col in df._data.columns: - if isinstance(col, cudf.core.column.ListColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.ListDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "list columns." ) - elif isinstance(col, cudf.core.column.StructColumn): + elif isinstance(dtype, cudf.StructDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "Struct columns." 
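The `csv.py` check above (like the ORC and dlpack hunks below) now keys off dtypes instead of column classes, so unsupported types can be detected from the `_dtypes` generator without touching column data. A sketch of the pattern, using the private `_dtypes` accessor introduced in this diff:

    import cudf

    df = cudf.DataFrame({"a": [[1, 2]], "b": [1]})
    unsupported = [
        name
        for name, dtype in df._dtypes
        if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype))
    ]
    # "a" is a list column, which to_csv rejects with NotImplementedError.
    assert unsupported == ["a"]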
@@ -203,12 +203,11 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) - for col_name, col in df._data.items(): - if isinstance(col, cudf.core.column.CategoricalColumn): + for col_name, col in df._column_labels_and_values: + if isinstance(col.dtype, cudf.CategoricalDtype): df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 1347b2cc38f..fe8e446f9c0 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -79,13 +79,13 @@ def to_dlpack(cudf_obj): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) - for col in gdf._data.columns + not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + for _, dtype in gdf._dtypes ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] + [dtype for _, dtype in gdf._dtypes] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index fd246c6215f..c54293badbe 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -396,8 +396,8 @@ def to_orc( ): """{docstring}""" - for col in df._data.columns: - if isinstance(col, cudf.core.column.CategoricalColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index bf2ee6ae624..0c1cda8810b 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,12 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +class ProxyFallbackError(Exception): + """Raised when fallback occurs""" + + pass + + def _fast_function_call(): """ Placeholder fast function for pytest profiling purposes. @@ -957,6 +963,10 @@ def _fast_slow_function_call( f"The exception was {e}." ) except Exception as err: + if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): + raise ProxyFallbackError( + f"The operation failed with cuDF, the reason was {type(err)}: {err}." 
+ ) from err with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 31ad24a4664..668e7a77454 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -676,7 +676,7 @@ def assert_frame_equal( if check_like: left, right = left.reindex(index=right.index), right - right = right[list(left._data.names)] + right = right[list(left._column_names)] # index comparison assert_index_equal( diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index 2136bca0e28..d05ba9aaacc 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -14,3 +14,6 @@ filterwarnings = ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning + # Ignore numba PEP 456 warning specific to arm machines + ignore:FNV hashing is not implemented in Numba.*:UserWarning +addopts = --tb=native diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 773141ee71a..979c936a182 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -33,9 +33,10 @@ def __array_function__(self, *args, **kwargs): missing_arrfunc_reason = "NEP-18 support is not available in NumPy" +np.random.seed(0) + @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ @@ -47,7 +48,8 @@ def __array_function__(self, *args, **kwargs): lambda x: np.linalg.norm(x), ], ) -def test_array_func_cudf_series(np_ar, func): +def test_array_func_cudf_series(func): + np_ar = np.random.random(100) cudf_ser = cudf.Series(np_ar) expect = func(np_ar) got = func(cudf_ser) @@ -58,9 +60,6 @@ def test_array_func_cudf_series(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) @pytest.mark.parametrize( "func", [ @@ -74,7 +73,8 @@ def test_array_func_cudf_series(np_ar, func): lambda x: np.prod(x, axis=1), ], ) -def test_array_func_cudf_dataframe(pd_df, func): +def test_array_func_cudf_dataframe(func): + pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) cudf_df = cudf.from_pandas(pd_df) expect = func(pd_df) got = func(cudf_df) @@ -82,9 +82,6 @@ def test_array_func_cudf_dataframe(pd_df, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) @pytest.mark.parametrize( "func", [ @@ -93,21 +90,22 @@ def test_array_func_cudf_dataframe(pd_df, func): lambda x: np.linalg.det(x), ], ) -def test_array_func_missing_cudf_dataframe(pd_df, func): +def test_array_func_missing_cudf_dataframe(func): + pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) cudf_df = cudf.from_pandas(pd_df) with pytest.raises(TypeError): func(cudf_df) @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ lambda x: np.unique(x), ], ) -def test_array_func_cudf_index(np_ar, func): +def test_array_func_cudf_index(func): + np_ar = np.random.random(100) 
cudf_index = cudf.Index(cudf.Series(np_ar)) expect = func(np_ar) got = func(cudf_index) @@ -118,7 +116,6 @@ def test_array_func_cudf_index(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ @@ -127,7 +124,8 @@ def test_array_func_cudf_index(np_ar, func): lambda x: np.linalg.det(x), ], ) -def test_array_func_missing_cudf_index(np_ar, func): +def test_array_func_missing_cudf_index(func): + np_ar = np.random.random(100) cudf_index = cudf.Index(cudf.Series(np_ar)) with pytest.raises(TypeError): func(cudf_index) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 9d69e626c3d..5acdf36de80 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -236,6 +236,7 @@ def test_avro_compression(rows, codec): }, ], rows, + seed=0, ) expected_df = cudf.DataFrame.from_arrow(df) @@ -599,7 +600,7 @@ def test_avro_reader_multiblock( else: assert dtype in ("float32", "float64") avro_type = "float" if dtype == "float32" else "double" - + np.random.seed(0) # We don't use rand_dataframe() here, because it increases the # execution time of each test by a factor of 10 or more (it appears # to use a very costly approach to generating random data). diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f4d1578bda7..6f88d942746 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -515,6 +515,17 @@ def test_dataframe_drop_columns(pdf, columns, inplace): assert_eq(expected, actual) +@pytest.mark.parametrize("obj", ["Index", "Series"]) +def test_drop_cudf_obj_columns(obj): + pdf = pd.DataFrame({"A": [1], "B": [1]}) + gdf = cudf.from_pandas(pdf) + + columns = ["B"] + expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1) + actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1) + assert_eq(expected, actual) + + @pytest.mark.parametrize( "pdf", [ diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0aaa71e50d7..848bc259e7b 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2470,6 +2470,7 @@ def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() pdf.columns = ["x", "y", "z"] @@ -2602,6 +2603,7 @@ def test_groupby_shift_row_mixed_numerics( ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2639,6 +2641,7 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2687,6 +2690,7 @@ def test_groupby_shift_row_mixed_fill( ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2732,6 +2736,7 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value): ], rows=nelem, use_threads=False, + seed=0, ) gdf = cudf.from_pandas(t.to_pandas()) @@ -2782,6 +2787,7 @@ def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2815,6 +2821,7 @@ def test_groupby_diff_row_zero_shift(nelem): ], rows=nelem, use_threads=False, + seed=0, ) gdf = 
cudf.from_pandas(t.to_pandas()) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b1e095e8853..c41be3e4428 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): mi1 = gdf.groupby(["Date", "Symbol"]).mean().index mi2 = mi1.copy(deep=deep) - lchildren = [col.children for _, col in mi1._data.items()] - rchildren = [col.children for _, col in mi2._data.items()] + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] # Flatten lchildren = reduce(operator.add, lchildren) @@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() - ] - rptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() - ] + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index c4ab4b0a853..2bbed40e34e 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -26,7 +26,11 @@ from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler -from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object +from cudf.pandas.fast_slow_proxy import ( + ProxyFallbackError, + _Unusable, + is_proxy_object, +) from cudf.testing import assert_eq if not LOADED: @@ -1738,3 +1742,13 @@ def add_one_ufunc(a): return a + 1 assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2))) + + +@pytest.mark.xfail( + reason="Fallback expected because casting to object is not supported", +) +def test_fallback_raises_error(monkeypatch): + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + with pytest.raises(ProxyFallbackError): + pd.Series(range(2)).astype(object) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py new file mode 100644 index 00000000000..896256bf6d7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from cudf.pandas import LOADED + +if not LOADED: + raise ImportError("These tests must be run with cudf.pandas loaded") + +import numpy as np +import pandas as pd + + +@pytest.fixture(autouse=True) +def fail_on_fallback(monkeypatch): + monkeypatch.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + + +@pytest.fixture +def dataframe(): + df = pd.DataFrame( + { + "a": [1, 1, 1, 2, 3], + "b": [1, 2, 3, 4, 5], + "c": [1.2, 1.3, 1.5, 1.7, 1.11], + } + ) + return df + + +@pytest.fixture +def series(dataframe): + return dataframe["a"] + + +@pytest.fixture +def array(series): + return series.values + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "min", + "max", + "mean", + "std", + "var", + "prod", + "median", + ], +) +def test_no_fallback_in_reduction_ops(series, op): + s = series + getattr(s, op)() + + +def test_groupby(dataframe): + df = dataframe + df.groupby("a", sort=True).max() + + +def test_no_fallback_in_binops(dataframe): + df = dataframe + df + df + df - df + df * df + df**df + df[["a", "b"]] & df[["a", "b"]] + df <= df + + +def test_no_fallback_in_groupby_rolling_sum(dataframe): + df = dataframe + df.groupby("a").rolling(2).sum() + + +def test_no_fallback_in_concat(dataframe): + df = dataframe + pd.concat([df, df]) + + +def test_no_fallback_in_get_shape(dataframe): + df = dataframe + df.shape + + +def test_no_fallback_in_array_ufunc_op(array): + np.add(array, array) + + +def test_no_fallback_in_merge(dataframe): + df = dataframe + pd.merge(df * df, df + df, how="inner") + pd.merge(df * df, df + df, how="outer") + pd.merge(df * df, df + df, how="left") + pd.merge(df * df, df + df, how="right") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini index 817d98e6ba2..98459035298 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini @@ -1,3 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + [pytest] xfail_strict=true markers= @@ -5,3 +7,4 @@ markers= xfail_gold: this test is expected to fail in the gold pass xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass xfail_compare: this test is expected to fail in the comparison pass +addopts = --tb=native diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
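The no-fallback test module above depends on the `CUDF_PANDAS_FAIL_ON_FALLBACK` switch added in `fast_slow_proxy.py`: when it is set, an operation that would silently fall back to CPU pandas raises `ProxyFallbackError` instead. A minimal usage sketch, assuming the interpreter was started with `python -m cudf.pandas` so that `pandas` is proxied:

    import os

    os.environ["CUDF_PANDAS_FAIL_ON_FALLBACK"] = "True"

    import pandas as pd  # proxied by cudf.pandas

    # Casting to object currently falls back to CPU pandas, so this
    # raises ProxyFallbackError rather than silently running on host.
    pd.Series(range(2)).astype(object)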
+ +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 41d06f8631b..66c15f694ee 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -14,6 +14,12 @@ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir +# Check we have a supported polars version +from cudf_polars.utils.versions import _ensure_polars_version + +_ensure_polars_version() +del _ensure_polars_version + __all__: list[str] = [ "execute_with_cudf", "translate_ir", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index f31193aa938..76816ee0a61 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,19 +5,26 @@ from __future__ import annotations +import contextlib import os import warnings -from functools import partial +from functools import cache, partial from typing import TYPE_CHECKING import nvtx -from polars.exceptions import PerformanceWarning +from polars.exceptions import ComputeError, PerformanceWarning + +import rmm +from rmm._cuda import gpu from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: + from collections.abc import Generator + import polars as pl + from polars import GPUEngine from cudf_polars.dsl.ir import IR from cudf_polars.typing import NodeTraverser @@ -25,23 +32,126 @@ __all__: list[str] = ["execute_with_cudf"] +@cache +def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: + """ + Return the default memory resource for cudf-polars. + + Parameters + ---------- + device + Disambiguating device id when selecting the device. Must be + the active device when this function is called. + + Returns + ------- + rmm.mr.DeviceMemoryResource + The default memory resource that cudf-polars uses. Currently + an async pool resource. + """ + try: + return rmm.mr.CudaAsyncMemoryResource() + except RuntimeError as e: # pragma: no cover + msg, *_ = e.args + if ( + msg.startswith("RMM failure") + and msg.find("not supported with this CUDA driver/runtime version") > -1 + ): + raise ComputeError( + "GPU engine requested, but incorrect cudf-polars package installed. " + "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` " + "and install `cudf-polars-cu11`" + ) from None + else: + raise + + +@contextlib.contextmanager +def set_memory_resource( + mr: rmm.mr.DeviceMemoryResource | None, +) -> Generator[rmm.mr.DeviceMemoryResource, None, None]: + """ + Set the current memory resource for an execution block. + + Parameters + ---------- + mr + Memory resource to use. If `None`, calls :func:`default_memory_resource` + to obtain an mr on the currently active device. + + Returns + ------- + Memory resource used. + + Notes + ----- + At exit, the memory resource is restored to whatever was current + at entry. If a memory resource is provided, it must be valid to + use with the currently active device. + """ + if mr is None: + device: int = gpu.getDevice() + mr = default_memory_resource(device) + previous = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(mr) + try: + yield mr + finally: + rmm.mr.set_current_device_resource(previous) + + +@contextlib.contextmanager +def set_device(device: int | None) -> Generator[int, None, None]: + """ + Set the device the query is executed on. + + Parameters + ---------- + device + Device to use. If `None`, uses the current device. 
+ + Returns + ------- + Device active for the execution of the block. + + Notes + ----- + At exit, the device is restored to whatever was current at entry. + """ + previous: int = gpu.getDevice() + if device is not None: + gpu.setDevice(device) + try: + yield previous + finally: + gpu.setDevice(previous) + + def _callback( ir: IR, with_columns: list[str] | None, pyarrow_predicate: str | None, n_rows: int | None, + *, + device: int | None, + memory_resource: int | None, ) -> pl.DataFrame: assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + with ( + nvtx.annotate(message="ExecuteIR", domain="cudf_polars"), + # Device must be set before memory resource is obtained. + set_device(device), + set_memory_resource(memory_resource), + ): return ir.evaluate(cache={}).to_polars() def execute_with_cudf( nt: NodeTraverser, *, - raise_on_fail: bool = False, + config: GPUEngine, exception: type[Exception] | tuple[type[Exception], ...] = Exception, ) -> None: """ @@ -52,9 +162,8 @@ def execute_with_cudf( nt NodeTraverser - raise_on_fail - Should conversion raise an exception rather than continuing - without setting a callback. + config + GPUEngine configuration object exception Optional exception, or tuple of exceptions, to catch during @@ -62,9 +171,23 @@ def execute_with_cudf( The NodeTraverser is mutated if the libcudf executor can handle the plan. """ + device = config.device + memory_resource = config.memory_resource + raise_on_fail = config.config.get("raise_on_fail", False) + if unsupported := (config.config.keys() - {"raise_on_fail"}): + raise ValueError( + f"Engine configuration contains unsupported settings {unsupported}" + ) try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf(partial(_callback, translate_ir(nt))) + nt.set_udf( + partial( + _callback, + translate_ir(nt), + device=device, + memory_resource=memory_resource, + ) + ) except exception as e: if bool(int(os.environ.get("POLARS_VERBOSE", 0))): warnings.warn( diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index dd3b771e305..3fe3e5557cb 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -84,6 +84,34 @@ def sorted_like(self, like: Column, /) -> Self: is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + # TODO: Return Column once #16272 is fixed. + def astype(self, dtype: plc.DataType) -> plc.Column: + """ + Return the backing column as the requested dtype. + + Parameters + ---------- + dtype + Datatype to cast to. + + Returns + ------- + Column of requested type. + + Raises + ------ + RuntimeError + If the cast is unsupported. + + Notes + ----- + This only produces a copy if the requested dtype doesn't match + the current one. + """ + if self.obj.type() != dtype: + return plc.unary.cast(self.obj, dtype) + return self.obj + def copy_metadata(self, from_: pl.Series, /) -> Self: """ Copy metadata from a host series onto self. 
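The new `Column.astype` above is a thin wrapper over `plc.unary.cast` that returns the backing `pylibcudf` column unchanged (no copy) when the requested type already matches. A small sketch, assuming a CUDA device is available:

    import pyarrow as pa
    import pylibcudf as plc

    from cudf_polars.containers import Column

    col = Column(plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int64())))
    same = col.astype(plc.DataType(plc.TypeId.INT64))  # matching dtype: no cast
    assert same is col.obj
    wider = col.astype(plc.DataType(plc.TypeId.FLOAT64))  # real cast, new column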
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index a5c99e2bc11..f3e3862d0cc 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -7,7 +7,7 @@ import itertools from functools import cached_property -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pyarrow as pa import pylibcudf as plc @@ -45,11 +45,19 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" + # If the arrow table has empty names, from_arrow produces + # column_$i. But here we know there is only one such column + # (by construction) and it should have an empty name. + # https://github.com/pola-rs/polars/issues/11632 + # To guarantee we produce correct names, we therefore + # serialise with names we control and rename with that map. + name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} table: pa.Table = plc.interop.to_arrow( self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + [plc.interop.ColumnMetadata(name=name) for name in name_map], ) - return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + df: pl.DataFrame = pl.from_arrow(table) + return df.rename(name_map).with_columns( *( pl.col(c.name).set_sorted( descending=c.order == plc.types.Order.DESCENDING diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e1b4d30b76b..c401e5a2f17 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -21,8 +21,10 @@ from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa +import pyarrow.compute as pc import pylibcudf as plc +from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import Column, NamedColumn @@ -477,12 +479,6 @@ def __init__( self.options = options self.name = name self.children = children - if ( - self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) - and not self.options[0] - ): - # With ignore_nulls == False, polars uses Kleene logic - raise NotImplementedError(f"Kleene logic for {self.name}") if self.name == pl_expr.BooleanFunction.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): @@ -577,20 +573,31 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.BooleanFunction.Any: + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 - ) - ) - elif self.name == pl_expr.BooleanFunction.All: - (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 - ) - ) + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process 
the result to insert the correct value. + h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) @@ -598,13 +605,19 @@ def do_evaluate( (column,) = columns return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: - # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj)) + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsNotNan: - # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj)) + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -654,26 +667,22 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for all_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_AND, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, output_type=self.dtype, ), (c.obj for c in columns), ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for any_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_OR, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, output_type=self.dtype, ), (c.obj for c in columns), @@ -694,7 +703,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "_regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -713,12 +722,18 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Uppercase, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Replace, + pl_expr.StringFunction.ReplaceMany, pl_expr.StringFunction.Slice, + pl_expr.StringFunction.Strptime, + pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + pl_expr.StringFunction.Uppercase, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -732,11 +747,65 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + pattern = self.children[1].value.as_py() + try: + self._regex_program = plc.strings.regex_program.RegexProgram.create( + pattern, + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except RuntimeError as e: + raise NotImplementedError( + f"Unsupported regex {pattern} for GPU engine." 
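Stepping back to the `Any`/`All` change at the top of this hunk: the truth tables encode polars' Kleene semantics for `ignore_nulls=False`, where the result of a null-excluding reduction sometimes has to be replaced by null. A minimal illustration of the semantics being matched, using only the public polars API (an illustrative aside, not part of the patch):

```python
import polars as pl

# Kleene OR: False | Null => Null, so `any` over {False, Null} is null...
assert pl.Series([False, None]).any(ignore_nulls=False) is None
# ...but a single True decides the result regardless of nulls.
assert pl.Series([True, None]).any(ignore_nulls=False) is True

# Kleene AND: False && anything => False, so `all` over {False, Null} is False...
assert pl.Series([False, None]).all(ignore_nulls=False) is False
# ...while True && Null stays unknown.
assert pl.Series([True, None]).all(ignore_nulls=False) is None
```

This is exactly the post-processing above: a `False` result of `any` (or a `True` result of `all`) over a column containing nulls cannot be trusted, and is replaced by an all-null column.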
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) elif self.name == pl_expr.StringFunction.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) def do_evaluate( self, @@ -759,12 +828,10 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) @@ -795,6 +862,22 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -825,6 +908,51 @@ def do_evaluate( else prefix.obj, ) ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = 
plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format.encode() + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format.encode() + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) raise NotImplementedError( f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising @@ -832,6 +960,18 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + pl_expr.TemporalFunction.Year: "year", + pl_expr.TemporalFunction.Month: "month", + pl_expr.TemporalFunction.Day: "day", + pl_expr.TemporalFunction.WeekDay: "weekday", + pl_expr.TemporalFunction.Hour: "hour", + pl_expr.TemporalFunction.Minute: "minute", + pl_expr.TemporalFunction.Second: "second", + pl_expr.TemporalFunction.Millisecond: "millisecond", + pl_expr.TemporalFunction.Microsecond: "microsecond", + pl_expr.TemporalFunction.Nanosecond: "nanosecond", + } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
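The `Strptime` evaluation just above validates every row with `is_timestamp` before converting: in strict mode any failure raises `InvalidOperationError`, otherwise failing rows are nulled out via `boolean_mask_scatter` before `to_timestamps`. A small sketch of the polars behaviour being reproduced (the series and format here are arbitrary):

```python
from datetime import date

import polars as pl

s = pl.Series(["2024-01-01", "not-a-date"])

# strict=False nulls out unparseable rows, which the GPU path mimics by
# scattering nulls over the non-timestamp positions before conversion.
assert s.str.strptime(pl.Date, "%Y-%m-%d", strict=False).to_list() == [
    date(2024, 1, 1),
    None,
]

# strict=True raises instead, matching the InvalidOperationError branch.
try:
    s.str.strptime(pl.Date, "%Y-%m-%d", strict=True)
except pl.exceptions.InvalidOperationError:
    pass
```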
@@ -846,8 +986,8 @@ def __init__( self.options = options self.name = name self.children = children - if self.name != pl_expr.TemporalFunction.Year: - raise NotImplementedError(f"String function {self.name}") + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") def do_evaluate( self, @@ -861,12 +1001,59 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.TemporalFunction.Year: - (column,) = columns - return Column(plc.datetime.extract_year(column.obj)) - raise NotImplementedError( - f"TemporalFunction {self.name}" - ) # pragma: no cover; init trips first + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis_as_nanos = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) class UnaryFunction(Expr): @@ -874,6 +1061,51 @@ class UnaryFunction(Expr): _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
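On the `Microsecond`/`Nanosecond` branches above: libcudf's `extract_datetime_component` returns each field in isolation (the microsecond field of `0.123456s` is just `456`), while polars reports cumulative sub-second values, hence the multiply-and-add recombination. A quick check of the polars semantics being targeted (the timestamp is chosen arbitrarily):

```python
from datetime import datetime

import polars as pl

ts = pl.Series([datetime(2024, 1, 1, 0, 0, 0, 123456)])

# polars reports cumulative sub-second components, so the GPU path must
# recombine libcudf's isolated fields: 123 * 1_000 + 456 == 123_456.
assert ts.dt.millisecond().item() == 123
assert ts.dt.microsecond().item() == 123_456
assert ts.dt.nanosecond().item() == 123_456_000
```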
+ # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: @@ -881,15 +1113,15 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ( - "mask_nans", - "round", - "setsorted", - "unique", - "dropnull", - "fill_null", - ): + + if self.name not in UnaryFunction._supported_fns: raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) def do_evaluate( self, @@ -947,7 +1179,7 @@ def do_evaluate( if maintain_order: return Column(column).sorted_like(values) return Column(column) - elif self.name == "setsorted": + elif self.name == "set_sorted": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -974,7 +1206,7 @@ def do_evaluate( order=order, null_order=null_order, ) - elif self.name == "dropnull": + elif self.name == "drop_nulls": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -994,13 +1226,65 @@ def do_evaluate( ) arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj return Column(plc.replace.replace_nulls(column.obj, arg)) - + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + 
) or (
+ self.name == "cum_prod"
+ and plc.traits.is_integral(col_type)
+ and plc.types.size_of(col_type) <= 4
+ ):
+ plc_col = plc.unary.cast(
+ plc_col, plc.types.DataType(plc.types.TypeId.INT64)
+ )
+ elif (
+ self.name == "cum_sum"
+ and column.obj.type().id() == plc.types.TypeId.BOOL8
+ ):
+ plc_col = plc.unary.cast(
+ plc_col, plc.types.DataType(plc.types.TypeId.UINT32)
+ )
+ if self.name == "cum_sum":
+ agg = plc.aggregation.sum()
+ elif self.name == "cum_prod":
+ agg = plc.aggregation.product()
+ elif self.name == "cum_min":
+ agg = plc.aggregation.min()
+ elif self.name == "cum_max":
+ agg = plc.aggregation.max()
+
+ return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE))
 raise NotImplementedError(
 f"Unimplemented unary function {self.name=}"
 ) # pragma: no cover; init trips first
 def collect_agg(self, *, depth: int) -> AggInfo:
 """Collect information about aggregations in groupbys."""
+ if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs:
+ raise NotImplementedError(f"{self.name} in groupby")
 if depth == 1:
 # inside aggregation, need to pre-evaluate, groupby
 # construction has checked that we don't have nested aggs,
@@ -1187,11 +1471,7 @@ class Cast(Expr):
 def __init__(self, dtype: plc.DataType, value: Expr) -> None:
 super().__init__(dtype)
 self.children = (value,)
- if not (
- plc.traits.is_fixed_width(self.dtype)
- and plc.traits.is_fixed_width(value.dtype)
- and plc.unary.is_supported_cast(value.dtype, self.dtype)
- ):
+ if not dtypes.can_cast(value.dtype, self.dtype):
 raise NotImplementedError(
 f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}"
 )
@@ -1255,6 +1535,13 @@ def __init__(
 req = plc.aggregation.variance(ddof=options)
 elif name == "count":
 req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE)
+ elif name == "quantile":
+ _, quantile = self.children
+ if not isinstance(quantile, Literal):
+ raise NotImplementedError("Only support literal quantile values")
+ req = plc.aggregation.quantile(
+ quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options]
+ )
 else:
 raise NotImplementedError(
 f"Unreachable, {name=} is incorrectly listed in _SUPPORTED"
@@ -1286,9 +1573,18 @@ def __init__(
 "count",
 "std",
 "var",
+ "quantile",
 ]
 )
+ interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = {
+ "nearest": plc.types.Interpolation.NEAREST,
+ "higher": plc.types.Interpolation.HIGHER,
+ "lower": plc.types.Interpolation.LOWER,
+ "midpoint": plc.types.Interpolation.MIDPOINT,
+ "linear": plc.types.Interpolation.LINEAR,
+ }
+
 def collect_agg(self, *, depth: int) -> AggInfo:
 """Collect information about aggregations in groupbys."""
 if depth >= 1:
@@ -1299,7 +1595,19 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 raise NotImplementedError("Nan propagation in groupby for min/max")
 (child,) = self.children
 ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
- if self.request is None:
+ request = self.request
+ # These are handled specially here: we don't set up a request
+ # for the whole-frame agg, since there we can avoid a reduce
+ # entirely.
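For the `first`/`last` special case that follows: in a group-by, polars keeps a leading or trailing null rather than skipping it, which is why the requests built below use `nth_element` with `NullPolicy.INCLUDE` instead of a null-excluding reduction. A small demonstration of that semantic (data chosen arbitrarily):

```python
import polars as pl

df = pl.DataFrame({"g": [1, 1, 2, 2], "x": [None, 10, 30, None]})

# first/last keep a leading/trailing null instead of skipping it,
# i.e. nth_element(0) / nth_element(-1) with nulls included.
out = df.group_by("g", maintain_order=True).agg(
    first=pl.col("x").first(), last=pl.col("x").last()
)
assert out["first"].to_list() == [None, 30]
assert out["last"].to_list() == [10, None]
```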
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first @@ -1308,7 +1616,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: # Ignore nans in these groupby aggs, do this by masking # nans in the input expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, self.request, self)]) + return AggInfo([(expr, request, self)]) def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation @@ -1380,7 +1688,10 @@ def do_evaluate( raise NotImplementedError( f"Agg in context {context}" ) # pragma: no cover; unreachable - (child,) = self.children + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] return self.op(child.evaluate(df, context=context, mapping=mapping)) @@ -1425,6 +1736,11 @@ def __init__( right: Expr, ) -> None: super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) self.op = op self.children = (left, right) if not plc.binaryop.is_supported_operation( @@ -1436,6 +1752,15 @@ def __init__( f"with output type {self.dtype.id().name}" ) + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e334e6f5cc5..8cd56c8ee3a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,7 +15,6 @@ import dataclasses import itertools -import types from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -28,7 +27,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Callable, MutableMapping @@ -133,8 +132,7 @@ class IR: def __post_init__(self): """Validate preconditions.""" - if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): - raise NotImplementedError("Cannot make empty columns.") + pass # noqa: PIE790 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -189,32 +187,42 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - file_options: Any - """Options for reading the file. - - Attributes are: - - ``with_columns: list[str]`` of projected columns to return. 
- - ``n_rows: int``: Number of rows to read. - - ``row_index: tuple[name, offset] | None``: Add an integer index - column with given name. - """ + with_columns: list[str] + """Projected columns to return.""" + skip_rows: int + """Rows to skip at the start when reading.""" + n_rows: int + """Number of rows to read after skipping.""" + row_index: tuple[str, int] | None + """If not None add an integer index column of the given name.""" predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side raise NotImplementedError(f"Unhandled scan type: {self.typ}") - if self.typ == "ndjson" and self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") + if self.typ == "ndjson" and (self.n_rows != -1 or self.skip_rows != 0): + raise NotImplementedError("row limit in scan for json reader") + if self.skip_rows < 0: + # TODO: polars has this implemented for parquet, + # maybe we can do this too? + raise NotImplementedError("slice pushdown for negative slices") + if self.typ == "csv" and self.skip_rows != 0: # pragma: no cover + # This comes from slice pushdown, but that + # optimization doesn't happen right now + raise NotImplementedError("skipping rows in CSV reader") if self.cloud_options is not None and any( self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp") ): raise NotImplementedError( "Read from cloud storage" ) # pragma: no cover; no test yet + if any(p.startswith("https://") for p in self.paths): + raise NotImplementedError("Read from https") if self.typ == "csv": if self.reader_options["skip_rows_after_header"] != 0: raise NotImplementedError("Skipping rows after header in CSV reader") @@ -242,13 +250,21 @@ def __post_init__(self) -> None: raise NotImplementedError( "ignore_errors is not supported in the JSON reader" ) + elif ( + self.typ == "parquet" + and self.row_index is not None + and self.with_columns is not None + and len(self.with_columns) == 0 + ): + raise NotImplementedError( + "Reading only parquet metadata to produce row index." 
+ ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - options = self.file_options - with_columns = options.with_columns - row_index = options.row_index - nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 + with_columns = self.with_columns + row_index = self.row_index + n_rows = self.n_rows if self.typ == "csv": parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) @@ -256,7 +272,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: eol = chr(parse_options["eol_char"]) if self.reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["inner"].keys()) + column_names = list(self.reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -282,6 +298,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] + read_partial = n_rows != -1 for p in self.paths: skiprows = self.reader_options["skip_rows"] path = Path(p) @@ -303,9 +320,13 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: comment=comment, decimal=decimal, dtypes=self.schema, - nrows=nrows, + nrows=n_rows, ) pieces.append(tbl_w_meta) + if read_partial: + n_rows -= tbl_w_meta.tbl.num_rows() + if n_rows <= 0: + break tables, colnames = zip( *( (piece.tbl, piece.column_names(include_children=False)) @@ -321,7 +342,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - nrows=nrows, + nrows=n_rows, + skip_rows=self.skip_rows, ) df = DataFrame.from_table( tbl_w_meta.tbl, @@ -354,12 +376,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if ( - row_index is not None - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - and row_index[0] in self.schema - ): + if row_index is not None: name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -481,36 +498,6 @@ def evaluate( return DataFrame(columns) -def placeholder_column(n: int) -> plc.Column: - """ - Produce a placeholder pylibcudf column with NO BACKING DATA. - - Parameters - ---------- - n - Number of rows the column will advertise - - Returns - ------- - pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. - - Notes - ----- - This is used to avoid allocating data for count aggregations. 
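The CSV branch above reads the candidate files one at a time and stops as soon as the requested number of rows has been produced. A self-contained sketch of that budgeting loop; `read_one` is a toy stand-in for the pylibcudf reader, and all names here are illustrative:

```python
def read_one(path: str, nrows: int) -> list[int]:
    # Toy stand-in for a per-file CSV read: pretend every file holds
    # three rows and honour the row cap (-1 means "no cap").
    rows = [1, 2, 3]
    return rows if nrows == -1 else rows[:nrows]


def read_limited(paths: list[str], n_rows: int) -> list[list[int]]:
    # Mirror of the loop above: decrement the remaining budget after
    # each file and stop once it is exhausted.
    pieces = []
    read_partial = n_rows != -1
    for path in paths:
        piece = read_one(path, nrows=n_rows)
        pieces.append(piece)
        if read_partial:
            n_rows -= len(piece)
            if n_rows <= 0:
                break
    return pieces


assert read_limited(["a.csv", "b.csv", "c.csv"], 4) == [[1, 2, 3], [1]]
```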
- """ - return plc.Column( - plc.DataType(plc.TypeId.INT8), - n, - plc.gpumemoryview( - types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) - ), - None, - 0, - 0, - [], - ) - - @dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -557,8 +544,7 @@ def check_agg(agg: expr.Expr) -> int: def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" - if self.options.rolling is None and self.maintain_order: - raise NotImplementedError("Maintaining order in groupby") + super().__post_init__() if self.options.rolling: raise NotImplementedError( "rolling window/groupby" @@ -566,6 +552,8 @@ def __post_init__(self) -> None: if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + if len(self.keys) == 0: + raise NotImplementedError("dynamic groupby") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -591,7 +579,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: - col = placeholder_column(df.num_rows) + # A count aggregation, doesn't touch the column, + # but we need to have one. Rather than evaluating + # one, just use one of the key columns. + col = keys[0].obj else: col = pre_eval.evaluate(df).obj requests.append(plc.groupby.GroupByRequest(col, [req])) @@ -611,7 +602,34 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) + broadcasted = broadcast(*result_keys, *results) + result_keys = broadcasted[: len(result_keys)] + results = broadcasted[len(result_keys) :] + # Handle order preservation of groups + # like cudf classic does + # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 + if self.maintain_order and not sorted: + left = plc.stream_compaction.stable_distinct( + plc.Table([k.obj for k in keys]), + list(range(group_keys.num_columns())), + plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + right = plc.Table([key.obj for key in result_keys]) + _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + ordered_table = plc.copying.gather( + plc.Table([col.obj for col in broadcasted]), + indices, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + broadcasted = [ + NamedColumn(reordered, b.name) + for reordered, b in zip( + ordered_table.columns(), broadcasted, strict=True + ) + ] + return DataFrame(broadcasted).slice(self.options.slice) @dataclasses.dataclass @@ -627,7 +645,7 @@ class Join(IR): right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], bool, tuple[int, int] | None, str | None, @@ -644,6 +662,7 @@ class Join(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -653,7 +672,7 @@ def __post_init__(self) -> None: 
@staticmethod @cache def _joiners( - how: Literal["inner", "left", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -663,7 +682,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left": + elif how == "left" or how == "right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -687,8 +706,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - else: - assert_never(how) + assert_never(how) def _reorder_maps( self, @@ -786,8 +804,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: + if how == "right": + # Right join is a left join with the tables swapped + left, right = right, left + left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left": + if how == "left" or how == "right": # Order of left table is preserved lg, rg = self._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy @@ -815,6 +837,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) ) right = right.discard_columns(right_on.column_names_set) + if how == "right": + # Undo the swap for right join before gluing together. + left, right = right, left right = right.rename_columns( { name: f"{name}{suffix}" @@ -1065,11 +1090,13 @@ class MapFunction(IR): # "merge_sorted", "rename", "explode", + "unpivot", ] ) def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1086,6 +1113,22 @@ def __post_init__(self) -> None: set(new) & (set(self.df.schema.keys() - set(old))) ): raise NotImplementedError("Duplicate new names in rename.") + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + value_name = "value" if value_name is None else value_name + variable_name = "variable" if variable_name is None else variable_name + if len(pivotees) == 0: + index = frozenset(indices) + pivotees = [name for name in self.df.schema if name not in index] + if not all( + dtypes.can_cast(self.df.schema[p], self.schema[value_name]) + for p in pivotees + ): + raise NotImplementedError( + "Unpivot cannot cast all input columns to " + f"{self.schema[value_name].id()}" + ) + self.options = (indices, pivotees, variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -1107,6 +1150,41 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + npiv = len(pivotees) + df = self.df.evaluate(cache=cache) + index_columns = [ + NamedColumn(col, name) + for col, name in zip( + plc.reshape.tile(df.select(indices).table, npiv).columns(), + indices, + strict=True, + ) + ] + (variable_column,) = plc.filling.repeat( + plc.Table( + [ + plc.interop.from_arrow( + pa.array( + pivotees, + type=plc.interop.to_arrow(self.schema[variable_name]), + ), + ) + ] 
+ ), + df.num_rows, + ).columns() + value_column = plc.concatenate.concatenate( + [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + ) + return DataFrame( + [ + *index_columns, + NamedColumn(variable_column, variable_name), + NamedColumn(value_column, value_name), + ] + ) else: raise AssertionError("Should never be reached") # pragma: no cover @@ -1122,6 +1200,7 @@ class Union(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise NotImplementedError("Schema mismatch") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 6dc97c7cb51..a0291037f01 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -75,13 +75,12 @@ def _translate_ir( def _( node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.PythonScan( - schema, - node.options, - translate_named_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, + scan_fn, with_columns, source_type, predicate, nrows = node.options + options = (scan_fn, with_columns, source_type, nrows) + predicate = ( + translate_named_expr(visitor, n=predicate) if predicate is not None else None ) + return ir.PythonScan(schema, options, predicate) @_translate_ir.register @@ -94,13 +93,27 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) + file_options = node.file_options + with_columns = file_options.with_columns + n_rows = file_options.n_rows + if n_rows is None: + n_rows = -1 # All rows + skip_rows = 0 # Don't skip + else: + # TODO: with versioning, rename on the rust side + skip_rows, n_rows = n_rows + + row_index = file_options.row_index return ir.Scan( schema, typ, reader_options, cloud_options, node.paths, - node.file_options, + with_columns, + skip_rows, + n_rows, + row_index, translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, @@ -293,10 +306,28 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + # Polars 1.7 changes definition of the CSV reader options schema name. + if (version := visitor.version()) >= (3, 0): + raise NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + with ctx: + polars_schema = visitor.get_schema() node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - return _translate_ir(node, visitor, schema) + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + result = _translate_ir(node, visitor, schema) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + raise NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
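On the `Join` changes earlier in this patch: `how="right"` is implemented as a left join with the operands swapped, then swapped back so that suffixes land on the correct side. Assuming a polars version that exposes right joins, the equivalence being exploited looks like this (modulo column order, hence the `select`):

```python
import polars as pl

left = pl.DataFrame({"k": [1, 2], "a": ["x", "y"]})
right = pl.DataFrame({"k": [2, 3], "b": ["u", "v"]})

# A right join is a left join with the tables swapped.
r1 = left.join(right, on="k", how="right")
r2 = right.join(left, on="k", how="left")
cols = sorted(r1.columns)
assert sorted(r2.columns) == cols
assert r1.sort("k").select(cols).equals(r2.sort("k").select(cols))
```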
+ ) + return result def translate_named_expr( @@ -345,6 +376,24 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): + if name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = (translate_expr(visitor, n=n) for n in node.input) + if isinstance(chars, expr.Literal): + if chars.value == pa.scalar(""): + # No-op in polars, but libcudf uses empty string + # as signifier to remove whitespace. + return column + elif chars.value == pa.scalar(None): + # Polars uses None to mean "strip all whitespace" + chars = expr.Literal( + column.dtype, + pa.scalar("", type=plc.interop.to_arrow(column.dtype)), + ) + return expr.StringFunction(dtype, name, options, column, chars) return expr.StringFunction( dtype, name, @@ -369,19 +418,43 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex *(translate_expr(visitor, n=n) for n in node.input), ) elif isinstance(name, pl_expr.TemporalFunction): - return expr.TemporalFunction( + # functions for which evaluation of the expression may not return + # the same dtype as polars, either due to libcudf returning a different + # dtype, or due to our internal processing affecting what libcudf returns + needs_cast = { + pl_expr.TemporalFunction.Year, + pl_expr.TemporalFunction.Month, + pl_expr.TemporalFunction.Day, + pl_expr.TemporalFunction.WeekDay, + pl_expr.TemporalFunction.Hour, + pl_expr.TemporalFunction.Minute, + pl_expr.TemporalFunction.Second, + pl_expr.TemporalFunction.Millisecond, + } + result_expr = expr.TemporalFunction( dtype, name, options, *(translate_expr(visitor, n=n) for n in node.input), ) + if name in needs_cast: + return expr.Cast(dtype, result_expr) + return result_expr + elif isinstance(name, str): - return expr.UnaryFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) + children = (translate_expr(visitor, n=n) for n in node.input) + if name == "log": + (base,) = options + (child,) = children + return expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOG_BASE, + child, + expr.Literal(dtype, pa.scalar(base, type=plc.interop.to_arrow(dtype))), + ) + elif name == "pow": + return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) + return expr.UnaryFunction(dtype, name, options, *children) raise NotImplementedError( f"No handler for Expr function node with {name=}" ) # pragma: no cover; polars raises on the rust side for now diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index d37c96a15de..7b6f3848fc4 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -5,12 +5,11 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING +from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -77,21 +76,13 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
""" - if collect_kwargs is None: - collect_kwargs = {} - final_polars_collect_kwargs = collect_kwargs.copy() - final_cudf_collect_kwargs = collect_kwargs.copy() - if polars_collect_kwargs is not None: - final_polars_collect_kwargs.update(polars_collect_kwargs) - if cudf_collect_kwargs is not None: # pragma: no cover - # exclude from coverage since not used ATM - # but this is probably still useful - final_cudf_collect_kwargs.update(cudf_collect_kwargs) - expect = lazydf.collect(**final_polars_collect_kwargs) - got = lazydf.collect( - **final_cudf_collect_kwargs, - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs ) + + expect = lazydf.collect(**final_polars_collect_kwargs) + engine = GPUEngine(raise_on_fail=True) + got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) assert_frame_equal( expect, got, @@ -134,3 +125,98 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") + + +def _process_kwargs( + collect_kwargs: dict[OptimizationArgs, bool] | None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None, +) -> tuple[dict[OptimizationArgs, bool], dict[OptimizationArgs, bool]]: + if collect_kwargs is None: + collect_kwargs = {} + final_polars_collect_kwargs = collect_kwargs.copy() + final_cudf_collect_kwargs = collect_kwargs.copy() + if polars_collect_kwargs is not None: # pragma: no cover; not currently used + final_polars_collect_kwargs.update(polars_collect_kwargs) + if cudf_collect_kwargs is not None: # pragma: no cover; not currently used + final_cudf_collect_kwargs.update(cudf_collect_kwargs) + return final_polars_collect_kwargs, final_cudf_collect_kwargs + + +def assert_collect_raises( + lazydf: pl.LazyFrame, + *, + polars_except: type[Exception] | tuple[type[Exception], ...], + cudf_except: type[Exception] | tuple[type[Exception], ...], + collect_kwargs: dict[OptimizationArgs, bool] | None = None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, +): + """ + Assert that collecting the result of a query raises the expected exceptions. + + Parameters + ---------- + lazydf + frame to collect. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_except + Exception or exceptions polars CPU is expected to raise. If + None, CPU is not expected to raise an exception. + cudf_except + Exception or exceptions polars GPU is expected to raise. If + None, GPU is not expected to raise an exception. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_collect_kwargs + Keyword arguments to pass to collect for execution on polars CPU. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + cudf_collect_kwargs + Keyword arguments to pass to collect for execution on cudf-polars. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + + Returns + ------- + None + If both sides raise the expected exceptions. 
+ + Raises + ------ + AssertionError + If either side did not raise the expected exceptions. + """ + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs + ) + + try: + lazydf.collect(**final_polars_collect_kwargs) + except polars_except: + pass + except Exception as e: + raise AssertionError( + f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}" + ) from e + else: + if polars_except != (): + raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") + + engine = GPUEngine(raise_on_fail=True) + try: + lazydf.collect(**final_cudf_collect_kwargs, engine=engine) + except cudf_except: + pass + except Exception as e: + raise AssertionError( + f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}" + ) from e + else: + if cudf_except != (): + raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}") diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py new file mode 100644 index 00000000000..05b76d76808 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -0,0 +1,158 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Plugin for running polars test suite setting GPU engine as default.""" + +from __future__ import annotations + +from functools import partialmethod +from typing import TYPE_CHECKING + +import pytest + +import polars + +if TYPE_CHECKING: + from collections.abc import Mapping + + +def pytest_addoption(parser: pytest.Parser): + """Add plugin-specific options.""" + group = parser.getgroup( + "cudf-polars", "Plugin to set GPU as default engine for polars tests" + ) + group.addoption( + "--cudf-polars-no-fallback", + action="store_true", + help="Turn off fallback to CPU when running tests (default use fallback)", + ) + + +def pytest_configure(config: pytest.Config): + """Enable use of this module as a pytest plugin to enable GPU collection.""" + no_fallback = config.getoption("--cudf-polars-no-fallback") + collect = polars.LazyFrame.collect + engine = polars.GPUEngine(raise_on_fail=no_fallback) + polars.LazyFrame.collect = partialmethod(collect, engine=engine) + config.addinivalue_line( + "filterwarnings", + "ignore:.*GPU engine does not support streaming or background collection", + ) + config.addinivalue_line( + "filterwarnings", + "ignore:.*Query execution with GPU not supported", + ) + + +EXPECTED_FAILURES: Mapping[str, str] = { + "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", + "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", + "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read", + "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing", + "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + 
"tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + 
"tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", + "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", + "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", + "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported", + "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", + "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", + "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": 
"Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", + "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", + 
"tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", + "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", + "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", + "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", + "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + # Maybe flaky, order-dependent? + "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", + "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", +} + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: list[pytest.Item] +): + """Mark known failing tests.""" + if config.getoption("--cudf-polars-no-fallback"): + # Don't xfail tests if running without fallback + return + for item in items: + if item.nodeid in EXPECTED_FAILURES: + item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index adab10bdded..240b11bdf59 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -84,6 +84,10 @@ def view_expression(self, n: int) -> Expr: """Convert the given expression to python rep.""" ... + def version(self) -> tuple[int, int]: + """The IR version as `(major, minor)`.""" + ... + def set_udf( self, callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 7f6ea1edfd9..4154a404e98 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,7 +13,7 @@ import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists"] +__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -45,6 +45,28 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ +def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. 
+ + Parameters + ---------- + from_ + Source datatype + to + Target datatype + + Returns + ------- + True if casting is supported, False otherwise + """ + return ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) + + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: """ diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 9807cffb384..4a7ad6b3cf2 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,18 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") -POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") -POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") -POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") -POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") -POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") -POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") - -POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") -POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") -POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") -POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") - -if POLARS_VERSION < parse("1.0"): # pragma: no cover - raise ImportError("cudf_polars requires py-polars v1.0 or greater.") +POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8") + + +def _ensure_polars_version(): + if POLARS_VERSION_LT_18: + raise ImportError( + "cudf_polars requires py-polars v1.8 or greater." + ) # pragma: no cover diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index daf8286ae07..bff44af1468 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -15,8 +15,10 @@ You will need: ## Installing polars -We will need to build polars from source. Until things settle down, -live at `HEAD`. +`cudf-polars` works with polars >= 1.8, as long as the internal IR +version doesn't get a major version bump. So `pip install polars>=1.8` +should work. For development, if we are adding features on the polars +side, we will need to build polars from source: ```sh git clone https://github.com/pola-rs/polars @@ -59,7 +61,7 @@ The executor for the polars logical plan lives in the cudf repo, in ```sh cd cudf/python/cudf_polars -uv pip install --no-build-isolation --no-deps -e . +pip install --no-build-isolation --no-deps -e . ``` You should now be able to run the tests in the `cudf_polars` package: @@ -69,16 +71,18 @@ pytest -v tests # Executor design -The polars `LazyFrame.collect` functionality offers a -"post-optimization" callback that may be used by a third party library -to replace a node (or more, though we only replace a single node) in the -optimized logical plan with a Python callback that is to deliver the -result of evaluating the plan. This splits the execution of the plan -into two phases. First, a symbolic phase which translates to our -internal representation (IR). Second, an execution phase which executes -using our IR. - -The translation phase receives the a low-level Rust `NodeTraverse` +The polars `LazyFrame.collect` functionality offers configuration of +the engine to use for collection through the `engine` argument.
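As an aside, the `can_cast` helper added to `cudf_polars/utils/dtypes.py` above is one of the translation-time guards this design relies on: if a required cast is unsupported, translation raises `NotImplementedError` and the query transparently falls back to CPU execution. A minimal sketch of that pattern (the `check_cast` name is illustrative, not part of this diff):

```python
import pylibcudf as plc

from cudf_polars.utils.dtypes import can_cast


def check_cast(from_: plc.DataType, to: plc.DataType) -> None:
    # Raising NotImplementedError during translation leaves the logical
    # plan untouched, so polars falls back to CPU execution.
    if not can_cast(from_, to):
        raise NotImplementedError(f"Cast from {from_.id()} to {to.id()}")
```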
At a +low level, selecting an engine configures a "post-optimization" +callback that may be used by a third party library to replace a node +(or more, though we only replace a single node) in the optimized +logical plan with a Python callback that delivers the result of +evaluating the plan. This splits the execution of the plan into two +phases. First, a symbolic phase which translates to our internal +representation (IR). Second, an execution phase which executes using +our IR. + +The translation phase receives a low-level Rust `NodeTraverser` object which delivers Python representations of the plan nodes (and expressions) one at a time. During translation, we endeavour to raise `NotImplementedError` for any unsupported functionality. This way, if @@ -86,33 +90,60 @@ we can't execute something, we just don't modify the logical plan at all: if we can translate the IR, it is assumed that evaluation will later succeed. -The usage of the cudf-based executor is therefore, at present: +The cudf-based executor is therefore selected with the +GPU engine: ```python -from cudf_polars.callback import execute_with_cudf +import polars as pl -result = q.collect(post_opt_callback=execute_with_cudf) +result = q.collect(engine="gpu") ``` This should either transparently run on the GPU and deliver a polars dataframe, or else fail (but be handled) and just run the normal CPU -execution. +execution. If `POLARS_VERBOSE` is true, then fallback is logged with a +`PerformanceWarning`. -If you want to fail during translation, set the keyword argument -`raise_on_fail` to `True`: +As well as a string argument, the engine can also be specified with a +polars `GPUEngine` object. This allows passing in more configuration. +Currently, the public properties are `device`, to select the device, +and `memory_resource`, to select the RMM memory resource used for +allocations during the collection phase. +For example: ```python -from functools import partial -from cudf_polars.callback import execute_with_cudf +import polars as pl -result = q.collect( - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) -) +result = q.collect(engine=pl.GPUEngine(device=1, memory_resource=mr)) +``` + +This uses device 1 and the given memory resource. Note that the memory +resource provided _must_ be valid for allocations on the specified +device; no checking is performed. + +For debugging purposes, we can also pass undocumented keyword +arguments. At the moment, `raise_on_fail` is supported, which +raises during translation rather than falling back: + +```python +result = q.collect(engine=pl.GPUEngine(raise_on_fail=True)) ``` This is mostly useful when writing tests, since in that case we want any failures to propagate, rather than falling back to the CPU mode. +## IR versioning + +On the polars side, the `NodeTraverser` object advertises an internal +version (via `NodeTraverser.version()`) as a `(major, minor)` tuple. +`minor` version bumps are for backwards-compatible changes (e.g. +exposing new nodes), whereas `major` bumps are for incompatible +changes. We can therefore attempt to detect the IR version +(independently of the polars version) and dispatch or error +appropriately. This should be done during IR translation in +`translate.py`. + ## Adding a handler for a new plan node Plan node definitions live in `cudf_polars/dsl/ir.py`, these are @@ -175,7 +206,7 @@ around their pylibcudf counterparts. We have four (in 1. `Scalar` (a wrapper around a pylibcudf `Scalar`) 2.
`Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` a `Column` with an additional name +3. `NamedColumn` (a `Column` with an additional name) 4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index b44f633e2d9..f55031e0826 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.0,<1.3", + "polars>=1.8,<1.9", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -58,6 +58,9 @@ exclude_also = [ "class .*\\bProtocol\\):", "assert_never\\(" ] +# The cudf_polars test suite doesn't exercise the plugin, so we omit +# it from coverage checks. +omit = ["cudf_polars/testing/plugin.py"] [tool.ruff] line-length = 88 diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 6b470268084..39fb44d55a5 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -9,6 +9,7 @@ import polars as pl from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): @@ -140,3 +141,13 @@ def test_sorted_flags_preserved(with_nulls, nulls_last): assert b.null_order == b_null_order assert c.is_sorted == plc.types.Sorted.NO assert df.flags == gf.to_polars().flags + + +def test_empty_name_roundtrips_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "column_0": [4, 5, 6]}) + assert_gpu_result_equal(df) + + +def test_empty_name_roundtrips_no_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "b": [4, 5, 6]}) + assert_gpu_result_equal(df) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 245bde3acab..56055f4c6c2 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -7,15 +7,38 @@ import polars as pl from cudf_polars.dsl import expr -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) -@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +@pytest.fixture( + params=[ + # regular aggs from Agg + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + # scan aggs from UnaryFunction + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + ] +) def agg(request): return request.param -@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16, pl.Int8, pl.UInt16]) def dtype(request): return request.param @@ -34,6 +57,11 @@ def df(dtype, with_nulls, is_sorted): if is_sorted: values = sorted(values, key=lambda x: -1000 if x is None else x) + if dtype.is_unsigned_integer(): + values = pl.Series(values).abs() + if is_sorted: + values = values.sort() + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) if is_sorted: return df.set_sorted("a") @@ -52,6 +80,51 @@ def test_agg(df, agg): assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) +def test_bool_agg(agg, request): + if 
agg == "cum_min" or agg == "cum_max": + pytest.skip("Does not apply") + request.applymarker( + pytest.mark.xfail( + condition=agg == "n_unique", + reason="Wrong dtype we get Int32, polars gets UInt32", + ) + ) + df = pl.LazyFrame({"a": [True, False, None, True]}) + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +def test_cum_agg_reverse_unsupported(cum_agg): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = getattr(pl.col("a"), cum_agg)(reverse=True) + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("q", [0.5, pl.lit(0.5)]) +@pytest.mark.parametrize("interp", ["nearest", "higher", "lower", "midpoint", "linear"]) +def test_quantile(df, q, interp): + expr = pl.col("a").quantile(q, interp) + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtypes = q.collect_schema()["a"] == pl.Float64 + if not check_dtypes: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + + +def test_quantile_invalid_q(df): + expr = pl.col("a").quantile(pl.col("a")) + q = df.select(expr) + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 97421008669..2347021c40e 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -17,15 +17,11 @@ def has_nulls(request): return request.param -@pytest.mark.parametrize( - "ignore_nulls", - [ - pytest.param( - False, marks=pytest.mark.xfail(reason="No support for Kleene logic") - ), - True, - ], -) +@pytest.fixture(params=[False, True], ids=["include_nulls", "ignore_nulls"]) +def ignore_nulls(request): + return request.param + + def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { @@ -43,6 +39,25 @@ def test_booleanfunction_reduction(ignore_nulls): assert_gpu_result_equal(query) +@pytest.mark.parametrize("expr", [pl.Expr.any, pl.Expr.all]) +def test_booleanfunction_all_any_kleene(expr, ignore_nulls): + ldf = pl.LazyFrame( + { + "a": [False, None], + "b": [False, False], + "c": [False, True], + "d": [None, False], + "e": pl.Series([None, None], dtype=pl.Boolean()), + "f": [None, True], + "g": [True, False], + "h": [True, None], + "i": [True, True], + } + ) + q = ldf.select(expr(pl.col("*"), ignore_nulls=ignore_nulls)) + assert_gpu_result_equal(q) + + @pytest.mark.parametrize( "expr", [ @@ -54,14 +69,7 @@ def test_booleanfunction_reduction(ignore_nulls): ids=lambda f: f"{f.__name__}()", ) @pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"]) -def test_boolean_function_unary(request, expr, has_nans, has_nulls): - if has_nulls and expr in (pl.Expr.is_nan, pl.Expr.is_not_nan): - request.applymarker( - pytest.mark.xfail( - reason="Need to copy null mask since is_{not_}nan(null) => null" - ) - ) - +def test_boolean_function_unary(expr, has_nans, has_nulls): values: list[float | None] = [1, 2, 3, 4, 5] if has_nans: values[3] = float("nan") @@ -119,9 +127,7 @@ def test_boolean_isbetween(closed, bounds): "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"] ) @pytest.mark.parametrize("wide", [False, True], 
ids=["narrow", "wide"]) -def test_boolean_horizontal(request, expr, has_nulls, wide): - if has_nulls: - request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic")) +def test_boolean_horizontal(expr, has_nulls, wide): ldf = pl.LazyFrame( { "a": [False, False, False, False, False, True], @@ -164,6 +170,18 @@ def test_boolean_is_in(expr): assert_gpu_result_equal(q) +@pytest.mark.parametrize("expr", [pl.Expr.and_, pl.Expr.or_, pl.Expr.xor]) +def test_boolean_kleene_logic(expr): + ldf = pl.LazyFrame( + { + "a": [False, False, False, None, None, None, True, True, True], + "b": [False, None, True, False, None, True, False, None, True], + } + ) + q = ldf.select(expr(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(q) + + def test_boolean_is_in_raises_unsupported(): ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 218101bf87c..c6ea29ddd38 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -9,7 +9,11 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.dsl.expr import TemporalFunction +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -37,26 +41,97 @@ def test_datetime_dataframe_scan(dtype): assert_gpu_result_equal(query) +datetime_extract_fields = [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +@pytest.fixture( + ids=datetime_extract_fields, + params=[methodcaller(f) for f in datetime_extract_fields], +) +def field(request): + return request.param + + +def test_datetime_extract(field): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + q = ldf.select(field(pl.col("datetimes").dt)) + + assert_gpu_result_equal(q) + + +def test_datetime_extra_unsupported(monkeypatch): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + def unsupported_name_setter(self, value): + pass + + def unsupported_name_getter(self): + return "unsupported" + + monkeypatch.setattr( + TemporalFunction, + "name", + property(unsupported_name_getter, unsupported_name_setter), + ) + + q = ldf.select(pl.col("datetimes").dt.nanosecond()) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "field", [ methodcaller("year"), - pytest.param( - methodcaller("day"), - marks=pytest.mark.xfail(reason="day extraction not implemented"), - ), + methodcaller("month"), + methodcaller("day"), + methodcaller("weekday"), ], ) -def test_datetime_extract(field): +def test_date_extract(field): + ldf = pl.LazyFrame( + { + "dates": [ + datetime.date(2024, 1, 1), + datetime.date(2024, 10, 11), + ] + } + ) + ldf = pl.LazyFrame( {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} ) - q = ldf.select(field(pl.col("dates").dt)) - with pytest.raises(AssertionError): - # polars produces int32, libcudf produces int16 for the year extraction - # libcudf can lose data here. 
- # https://github.com/rapidsai/cudf/issues/16196 - assert_gpu_result_equal(q) + q = ldf.select(field(pl.col("dates").dt)) - assert_gpu_result_equal(q, check_dtypes=False) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py index 6bffa3e252c..f7c5d1bf2cd 100644 --- a/python/cudf_polars/tests/expressions/test_gather.py +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -6,7 +6,6 @@ import polars as pl -from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -47,4 +46,4 @@ def test_gather_out_of_bounds(negative): query = ldf.select(pl.col("a").gather(pl.col("b"))) with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=execute_with_cudf) + query.collect(engine="gpu") diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py new file mode 100644 index 00000000000..ac3aecf88e6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture( + params=[ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "exp", + "sqrt", + "cbrt", + "ceil", + "floor", + "abs", + ] +) +def op(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32]) +def dtype(request): + return request.param + + +@pytest.fixture +def ldf(with_nulls, dtype): + values = [1, 2, 4, 5, -2, -4, 0] + if with_nulls: + values.append(None) + if dtype == pl.Float32: + values.append(-float("inf")) + values.append(float("nan")) + values.append(float("inf")) + elif dtype == pl.Int32: + iinfo = np.iinfo("int32") + values.append(iinfo.min) + values.append(iinfo.max) + return pl.LazyFrame( + { + "a": pl.Series(values, dtype=dtype), + "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32), + } + ) + + +def test_unary(ldf, op): + expr = getattr(pl.col("a"), op)() + q = ldf.select(expr) + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("base_literal", [False, True]) +@pytest.mark.parametrize("exponent_literal", [False, True]) +def test_pow(ldf, base_literal, exponent_literal): + base = pl.lit(2) if base_literal else pl.col("a") + exponent = pl.lit(-3, dtype=pl.Float32) if exponent_literal else pl.col("b") + + q = ldf.select(base.pow(exponent)) + + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("natural", [True, False]) +def test_log(ldf, natural): + if natural: + expr = pl.col("a").log() + else: + expr = pl.col("a").log(10) + + q = ldf.select(expr) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index df08e15baa4..4f6850ac977 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -10,6 +10,7 @@ from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ 
-152,3 +153,187 @@ def test_slice_column(slice_column_data): else: query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.fixture +def to_datetime_data(): + return pl.LazyFrame( + { + "a": [ + "2021-01-01", + "2021-01-02", + "abcd", + ] + } + ) + + +@pytest.mark.parametrize("cache", [True, False], ids=lambda cache: f"{cache=}") +@pytest.mark.parametrize("strict", [True, False], ids=lambda strict: f"{strict=}") +@pytest.mark.parametrize("exact", [True, False], ids=lambda exact: f"{exact=}") +@pytest.mark.parametrize("format", ["%Y-%m-%d", None], ids=lambda format: f"{format=}") +def test_to_datetime(to_datetime_data, cache, strict, format, exact): + query = to_datetime_data.select( + pl.col("a").str.strptime( + pl.Datetime("ns"), format=format, cache=cache, strict=strict, exact=exact + ) + ) + if cache or format is None or not exact: + assert_ir_translation_raises(query, NotImplementedError) + elif strict: + assert_collect_raises( + query, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target, repl", + [("a", "a"), ("Wı", "☺"), ("FG", ""), ("doesnotexist", "blahblah")], # noqa: RUF001 +) +@pytest.mark.parametrize("n", [0, 3, -1]) +def test_replace_literal(ldf, target, repl, n): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True, n=n)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("target, repl", [("", ""), ("a", pl.col("a"))]) +def test_replace_literal_unsupported(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_re(ldf): + query = ldf.select(pl.col("a").str.replace("A", "a", literal=False)) + assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.mark.parametrize( + "target,repl", + [ + (["A", "de", "kLm", "awef"], "a"), + (["A", "de", "kLm", "awef"], ""), + (["A", "de", "kLm", "awef"], ["a", "b", "c", "d"]), + (["A", "de", "kLm", "awef"], ["a", "b", "c", ""]), + ( + pl.lit(pl.Series(["A", "de", "kLm", "awef"])), + pl.lit(pl.Series(["a", "b", "c", "d"])), + ), + ], +) +def test_replace_many(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target,repl", + [(["A", ""], ["a", "b"]), (pl.col("a").drop_nulls(), pl.col("a").drop_nulls())], +) +def test_replace_many_notimplemented(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_many_ascii_case(ldf): + query = ldf.select( + pl.col("a").str.replace_many(["a", "b", "c"], "a", ascii_case_insensitive=True) + ) + + assert_ir_translation_raises(query, NotImplementedError) + + +_strip_data = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", + None, +] + + +@pytest.fixture +def strip_ldf(): + return pl.DataFrame({"a": _strip_data}).lazy() + + +@pytest.fixture(params=strip_chars) +def to_strip(request): + return request.param + + +def test_strip_chars(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_start(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_start(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_end(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_end(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_column(strip_ldf): + q = strip_ldf.select(pl.col("a").str.strip_chars(pl.col("a"))) + assert_ir_translation_raises(q, NotImplementedError) + + +def test_invalid_regex_raises(): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(r"ab)", strict=True)) + + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) + + +@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)"]) +def test_unsupported_regex_raises(pattern): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(pattern, strict=True)) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_polars/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 5b4bba55552..3c3986be19b 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -6,6 +6,9 @@ import pytest import polars as pl +from polars.testing.asserts import assert_frame_equal + +import rmm from cudf_polars.dsl.ir import IR from cudf_polars.testing.asserts import ( @@ -32,3 +35,48 @@ def raise_unimplemented(self): ): # And ensure that collecting issues the correct warning. 
assert_gpu_result_equal(q) + + +def test_unsupported_config_raises(): + q = pl.LazyFrame({}) + + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(unknown_key=True)) + + +@pytest.mark.parametrize("device", [-1, "foo"]) +def test_invalid_device_raises(device): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(device=device)) + + +@pytest.mark.parametrize("mr", [1, object()]) +def test_invalid_memory_resource_raises(mr): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(memory_resource=mr)) + + +def test_explicit_device_zero(): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + result = q.collect(engine=pl.GPUEngine(device=0)) + assert_frame_equal(q.collect(), result) + + +def test_explicit_memory_resource(): + upstream = rmm.mr.CudaMemoryResource() + n_allocations = 0 + + def allocate(bytes, stream): + nonlocal n_allocations + n_allocations += 1 + return upstream.allocate(bytes, stream) + + mr = rmm.mr.CallbackMemoryResource(allocate, upstream.deallocate) + + q = pl.LazyFrame({"a": [1, 2, 3]}) + result = q.collect(engine=pl.GPUEngine(memory_resource=mr)) + assert_frame_equal(q.collect(), result) + assert n_allocations > 0 diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index a75825ef3d3..74bf8b9e4e2 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture @@ -31,6 +30,7 @@ def df(): params=[ [pl.col("key1")], [pl.col("key2")], + [pl.col("key1"), pl.lit(1)], [pl.col("key1") * pl.col("key2")], [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], @@ -52,6 +52,7 @@ def keys(request): [(pl.col("float") - pl.lit(2)).max()], [pl.col("float").sum().round(decimals=1)], [pl.col("float").round(decimals=1).sum()], + [pl.col("int").first(), pl.col("float").last()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -60,15 +61,7 @@ def exprs(request): @pytest.fixture( - params=[ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="Maintaining order in groupby not implemented" - ), - ), - ], + params=[False, True], ids=["no_maintain_order", "maintain_order"], ) def maintain_order(request): @@ -98,15 +91,10 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): # Multiple keys don't do sorting qsorted = q.sort(*sort_keys) if len(keys) > 1: - with pytest.raises(AssertionError): - # https://github.com/pola-rs/polars/issues/17556 - assert_gpu_result_equal(q, check_exact=False) - if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): - # https://github.com/pola-rs/polars/issues/17557 - with pytest.raises(AssertionError): - assert_gpu_result_equal(qsorted, check_exact=False) - else: - assert_gpu_result_equal(qsorted, check_exact=False) + # https://github.com/pola-rs/polars/issues/17556 + # Can't assert that the query without post-sorting fails, + # since it _might_ pass. 
+ assert_gpu_result_equal(qsorted, check_exact=False) elif schema[sort_keys[0]] == pl.Boolean(): # Boolean keys don't do sorting, so we get random order assert_gpu_result_equal(qsorted, check_exact=False) @@ -133,6 +121,21 @@ def test_groupby_unsupported(df, expr): assert_ir_translation_raises(q, NotImplementedError) +def test_groupby_null_keys(maintain_order): + df = pl.LazyFrame( + { + "key": pl.Series([1, float("nan"), 2, None, 2, None], dtype=pl.Float64()), + "value": [-1, 2, 1, 2, 3, 4], + } + ) + + q = df.group_by("key", maintain_order=maintain_order).agg(pl.col("value").min()) + if not maintain_order: + q = q.sort("key") + + assert_gpu_result_equal(q) + + @pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") def test_groupby_minmax_with_nan(): df = pl.LazyFrame( @@ -159,21 +162,17 @@ def test_groupby_nan_minmax_raises(op): @pytest.mark.parametrize( "key", - [ - pytest.param( - 1, - marks=pytest.mark.xfail( - versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" - ), - ), - pl.col("key1"), - ], + [1, pl.col("key1")], ) @pytest.mark.parametrize( "expr", [ pl.lit(1).alias("value"), - pl.lit([[4, 5, 6]]).alias("value"), + pytest.param( + pl.lit([[4, 5, 6]]).alias("value"), + marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"), + ), + pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)), pl.col("float") * (1 - pl.col("int")), [pl.lit(2).alias("value"), pl.col("float") * 2], ], @@ -183,3 +182,12 @@ def test_groupby_literal_in_agg(df, key, expr): # so just sort by the group key q = df.group_by(key).agg(expr).sort(key, maintain_order=True) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [pl.col("int").unique(), pl.col("int").drop_nulls(), pl.col("int").cum_max()], +) +def test_groupby_unary_non_pointwise_raises(df, expr): + q = df.group_by("key1").agg(expr) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby_dynamic.py b/python/cudf_polars/tests/test_groupby_dynamic.py new file mode 100644 index 00000000000..38b3ce74ac5 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby_dynamic.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from datetime import datetime + +import polars as pl + +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_groupby_dynamic_raises(): + df = pl.LazyFrame( + { + "dt": [ + datetime(2021, 12, 31, 0, 0, 0), + datetime(2022, 1, 1, 0, 0, 1), + datetime(2022, 3, 31, 0, 0, 1), + datetime(2022, 4, 1, 0, 0, 1), + ] + } + ) + + q = ( + df.sort("dt") + .group_by_dynamic("dt", every="1q") + .agg(pl.col("dt").count().alias("num_values")) + ) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 1e880cdc6de..7d9ec98db97 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -17,7 +17,7 @@ def join_nulls(request): return request.param -@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +@pytest.fixture(params=["inner", "left", "right", "semi", "anti", "full"]) def how(request): return request.param diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index 77032108e6f..e895f27f637 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -61,3 +61,48 @@ def test_rename_columns(mapping): q = df.rename(mapping) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("index", [None, ["a"], ["d", "a"]]) +@pytest.mark.parametrize("variable_name", [None, "names"]) +@pytest.mark.parametrize("value_name", [None, "unpivoted"]) +def test_unpivot(index, variable_name, value_name): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot( + ["c", "b"], index=index, variable_name=variable_name, value_name=value_name + ) + + assert_gpu_result_equal(q) + + +def test_unpivot_defaults(): + df = pl.LazyFrame( + { + "a": pl.Series([11, 12, 13], dtype=pl.UInt16), + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot(index="d") + assert_gpu_result_equal(q) + + +def test_unpivot_unsupported_cast_raises(): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + } + ) + + q = df.unpivot(["a", "b"]) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index fd8453b77c4..0cda89474a8 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -8,7 +8,9 @@ def test_python_scan(): - def source(with_columns, predicate, nrows): + def source(with_columns, predicate, nrows, *batch_size): + # PythonScan interface changes between 1.3 and 1.4 to add an + # extra batch_size argument return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 64acbb076ed..792b136acd8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture( @@ -58,6 +57,22 @@ def mask(request): return request.param +@pytest.fixture( + params=[ + None, + 
(1, 1), + ], + ids=[ + "no-slice", + "slice-second", + ], +) +def slice(request): + # For use in testing that we handle + # polars slice pushdown correctly + return request.param + + def make_source(df, path, format): """ Writes the passed polars df to a file of @@ -79,7 +94,9 @@ def make_source(df, path, format): ("parquet", pl.scan_parquet), ], ) -def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request): +def test_scan( + tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request +): name, offset = row_index make_source(df, tmp_path / "file", format) request.applymarker( @@ -94,21 +111,23 @@ def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, r row_index_offset=offset, n_rows=n_rows, ) + if slice is not None: + q = q.slice(*slice) if mask is not None: q = q.filter(mask) if columns is not None: q = q.select(*columns) - polars_collect_kwargs = {} - if versions.POLARS_VERSION_LT_12: - # https://github.com/pola-rs/polars/issues/17553 - polars_collect_kwargs = {"projection_pushdown": False} - assert_gpu_result_equal( - q, - polars_collect_kwargs=polars_collect_kwargs, - # This doesn't work in polars < 1.2 since the row-index - # is in the wrong order in previous polars releases - check_column_order=versions.POLARS_VERSION_LT_12, - ) + assert_gpu_result_equal(q) + + +def test_negative_slice_pushdown_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.parquet") + q = pl.scan_parquet(tmp_path / "df.parquet") + # Take the last row + q = q.slice(-1, 1) + assert_ir_translation_raises(q, NotImplementedError) def test_scan_unsupported_raises(tmp_path): @@ -127,10 +146,6 @@ def test_scan_ndjson_nrows_notimplemented(tmp_path, df): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.xfail( - versions.POLARS_VERSION_LT_11, - reason="https://github.com/pola-rs/polars/issues/15730", -) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) @@ -169,15 +184,25 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): ("test*.csv", False), ], ) -def test_scan_csv_multi(tmp_path, filename, glob): +@pytest.mark.parametrize( + "nrows_skiprows", + [ + (None, 0), + (1, 1), + (3, 0), + (4, 2), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): + n_rows, skiprows = nrows_skiprows with (tmp_path / "test1.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test2.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob) + q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) @@ -280,3 +305,24 @@ def test_scan_ndjson_unsupported(df, tmp_path): make_source(df, tmp_path / "file", "ndjson") q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True) assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_nested_null_raises(tmp_path): + df = pl.DataFrame({"a": pl.Series([None], dtype=pl.List(pl.Null))}) + + df.write_parquet(tmp_path / "file.pq") + + q = pl.scan_parquet(tmp_path / "file.pq") + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_only_row_index_raises(df, tmp_path): + make_source(df, tmp_path / "file", "parquet") + q = 
pl.scan_parquet(tmp_path / "file", row_index_name="index").select("index") + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_hf_url_raises(): + q = pl.scan_csv("hf://datasets/scikit-learn/iris/Iris.csv") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py index ecc02efd967..cfa8e5ff9b9 100644 --- a/python/cudf_polars/tests/test_sort.py +++ b/python/cudf_polars/tests/test_sort.py @@ -13,10 +13,7 @@ "sort_keys", [ (pl.col("a"),), - pytest.param( - (pl.col("d").abs(),), - marks=pytest.mark.xfail(reason="abs not yet implemented"), - ), + (pl.col("d").abs(),), (pl.col("a"), pl.col("d")), (pl.col("b"),), ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 5bc2fe1efb7..ace1c6b8648 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -8,6 +8,7 @@ import polars as pl from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -26,10 +27,59 @@ def test_translation_assert_raises(): class E(Exception): pass - unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + unsupported = df.group_by("a").agg(pl.col("a").upper_bound().alias("b")) # Unsupported query should raise NotImplementedError assert_ir_translation_raises(unsupported, NotImplementedError) with pytest.raises(AssertionError): # This should fail, because we can't translate this query, but it doesn't raise E. assert_ir_translation_raises(unsupported, E) + + +def test_collect_assert_raises(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + with pytest.raises(AssertionError, match="CPU execution DID NOT RAISE"): + # This should raise, because polars CPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=(), + ) + + with pytest.raises(AssertionError, match="GPU execution DID NOT RAISE"): + # This should raise, because polars GPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=(), + cudf_except=pl.exceptions.InvalidOperationError, + ) + + # Here's an invalid query that gets caught at IR optimisation time. + q = df.select(pl.col("a") * pl.col("b")) + + # This exception is raised in preprocessing, so is the same for + # both CPU and GPU engines. + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with pytest.raises(AssertionError, match="GPU execution RAISED"): + # This should raise because the expected GPU error is wrong + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=NotImplementedError, + ) + + with pytest.raises(AssertionError, match="CPU execution RAISED"): + # This should raise because the expected CPU error is wrong + assert_collect_raises( + q, + polars_except=NotImplementedError, + cudf_except=pl.exceptions.InvalidOperationError, + ) diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/custreamz/custreamz/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 4655d2165f0..69e1524be39 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -16,6 +16,7 @@ See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to ## Resources - [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [Best practices](https://docs.rapids.ai/api/dask-cudf/stable/best_practices/) - [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) - [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) - [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 97e1dffc65b..907abaa2bfc 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -15,6 +15,7 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.typing import no_default import cudf @@ -90,6 +91,17 @@ def var( ) ) + def rename_axis( + self, mapper=no_default, index=no_default, columns=no_default, axis=0 + ): + from dask_cudf.expr._expr import RenameAxisCudf + + return new_collection( + RenameAxisCudf( + self, mapper=mapper, index=index, columns=columns, axis=axis + ) + ) + class DataFrame(DXDataFrame, CudfFrameBase): @classmethod @@ -202,27 +214,58 @@ class Index(DXIndex, CudfFrameBase): ## -try: - from dask_expr._backends import create_array_collection - - @get_collection_type.register_lazy("cupy") - def _register_cupy(): - import cupy - - @get_collection_type.register(cupy.ndarray) - def get_collection_type_cupy_array(_): - return create_array_collection - - @get_collection_type.register_lazy("cupyx") - def _register_cupyx(): - # Needed for cuml - from cupyx.scipy.sparse import spmatrix - - @get_collection_type.register(spmatrix) - def get_collection_type_csr_matrix(_): - return create_array_collection - -except ImportError: - # Older version of dask-expr. - # Implicit conversion to array wont work. - pass +def _create_array_collection_with_meta(expr): + # NOTE: This is the GPU compatible version of + # `new_dd_object` for DataFrame -> Array conversion. 
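+ # (The per-partition row counts are not known until the graph is + # computed, which is why the first chunk dimension is NaN below.)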
+ # This can be removed if dask#11017 is resolved + # (See: https://github.com/dask/dask/issues/11017) + import numpy as np + + import dask.array as da + from dask.blockwise import Blockwise + from dask.highlevelgraph import HighLevelGraph + + result = expr.optimize() + dsk = result.__dask_graph__() + name = result._name + meta = result._meta + divisions = result.divisions + chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( + (d,) for d in meta.shape[1:] + ) + if len(chunks) > 1: + if isinstance(dsk, HighLevelGraph): + layer = dsk.layers[name] + else: + # dask-expr provides a dict only + layer = dsk + if isinstance(layer, Blockwise): + layer.new_axes["j"] = chunks[1][0] + layer.output_indices = layer.output_indices + ("j",) + else: + suffix = (0,) * (len(chunks) - 1) + for i in range(len(chunks[0])): + layer[(name, i) + suffix] = layer.pop((name, i)) + + return da.Array(dsk, name=name, chunks=chunks, meta=meta) + + +@get_collection_type.register_lazy("cupy") +def _register_cupy(): + import cupy + + get_collection_type.register( + cupy.ndarray, + lambda _: _create_array_collection_with_meta, + ) + + +@get_collection_type.register_lazy("cupyx") +def _register_cupyx(): + # Needed for cuml + from cupyx.scipy.sparse import spmatrix + + get_collection_type.register( + spmatrix, + lambda _: _create_array_collection_with_meta, + ) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 8a2c50d3fe7..b284ab3774d 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -4,11 +4,12 @@ import dask_expr._shuffle as _shuffle_module from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise -from dask_expr._expr import Elemwise, Expr, VarColumns +from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype +from dask.typing import no_default import cudf @@ -17,6 +18,19 @@ ## +class RenameAxisCudf(RenameAxis): + # TODO: Remove this after rename_axis is supported in cudf + # (See: https://github.com/rapidsai/cudf/issues/16895) + @staticmethod + def operation(df, index=no_default, **kwargs): + if index != no_default: + df.index.name = index + return df + raise NotImplementedError( + "Only `index` is supported for the cudf backend" + ) + + class ToCudfBackend(Elemwise): # TODO: Inherit from ToBackend when rapids-dask-dependency # is pinned to dask>=2024.8.1 diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7aa0f6320f2..5f0fae86691 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -16,6 +16,7 @@ import dask_cudf from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, require_dask_expr, skip_dask_expr, xfail_dask_expr, @@ -950,12 +951,16 @@ def test_implicit_array_conversion_cupy(): def func(x): return x.values - # Need to compute the dask collection for now. 
- # See: https://github.com/dask/dask/issues/11017 - result = ds.map_partitions(func, meta=s.values).compute() - expect = func(s) + result = ds.map_partitions(func, meta=s.values) - dask.array.assert_eq(result, expect) + if QUERY_PLANNING_ON: + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) + else: + # Legacy version still carries numpy metadata + # See: https://github.com/dask/dask/issues/11017 + dask.array.assert_eq(result.compute(), func(s)) def test_implicit_array_conversion_cupy_sparse(): @@ -967,8 +972,6 @@ def func(x): return cupyx.scipy.sparse.csr_matrix(x.values) - # Need to compute the dask collection for now. - # See: https://github.com/dask/dask/issues/11017 result = ds.map_partitions(func, meta=s.values).compute() expect = func(s) @@ -1024,3 +1027,15 @@ def test_cov_corr(op, numeric_only): # (See: https://github.com/rapidsai/cudf/issues/12626) expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) dd.assert_eq(res, expect) + + +def test_rename_axis_after_join(): + df1 = cudf.DataFrame(index=["a", "b", "c"], data=dict(a=[1, 2, 3])) + df1.index.name = "test" + ddf1 = dd.from_pandas(df1, 2) + + df2 = cudf.DataFrame(index=["a", "b", "d"], data=dict(b=[1, 2, 3])) + ddf2 = dd.from_pandas(df2, 2) + result = ddf1.join(ddf2, how="outer") + expected = df1.join(df2, how="outer") + dd.assert_eq(result, expected, check_index=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 88b15718382..d03e92319be 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -13,6 +13,7 @@ def _make_random_frame(nelem, npartitions=2): + np.random.seed(0) df = pd.DataFrame( { "x": np.random.randint(0, 5, size=nelem), @@ -38,7 +39,6 @@ def wrapped(series): @pytest.mark.parametrize("reducer", _reducers) def test_series_reduce(reducer): reducer = _get_reduce_fn(reducer) - np.random.seed(0) size = 10 df, gdf = _make_random_frame(size) diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5a67f4d6cdb..5f9d145139a 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -94,7 +94,7 @@ cpdef bool is_supported_operation( ): """Check if an operation is supported for the given data types. - For details, see :cpp:func::is_supported_operation`. + For details, see :cpp:func:`is_supported_operation`. Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index 4601cba515a..e9085e3ea02 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -18,6 +18,20 @@ from .types import MaskState, TypeId cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + """Creates an empty column of the specified type. + + For details, see :cpp:func:`make_empty_column`. + + Parameters + ---------- + type_or_id : Union[DataType, type_id, object] + The column data type. + + Returns + ------- + Column + An empty Column. + """ cdef unique_ptr[column] result cdef type_id id @@ -60,7 +74,11 @@ cpdef Column make_numeric_column( size_type size, MaskArg mstate ): + """Creates an empty numeric column. + + For details, see :cpp:func:`make_numeric_column`.
+ """ cdef unique_ptr[column] result cdef mask_state state diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 0ddc68bcb9d..e8e0caaf42d 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -2,7 +2,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year +from pylibcudf.libcudf.datetime cimport ( + day_of_year as cpp_day_of_year, + extract_day as cpp_extract_day, + extract_hour as cpp_extract_hour, + extract_microsecond_fraction as cpp_extract_microsecond_fraction, + extract_millisecond_fraction as cpp_extract_millisecond_fraction, + extract_minute as cpp_extract_minute, + extract_month as cpp_extract_month, + extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, + extract_second as cpp_extract_second, + extract_weekday as cpp_extract_weekday, + extract_year as cpp_extract_year, +) from .column cimport Column @@ -28,3 +40,42 @@ cpdef Column extract_year( with nogil: result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) + + +def extract_datetime_component(Column col, str field): + + cdef unique_ptr[column] c_result + + with nogil: + if field == "year": + c_result = move(cpp_extract_year(col.view())) + elif field == "month": + c_result = move(cpp_extract_month(col.view())) + elif field == "day": + c_result = move(cpp_extract_day(col.view())) + elif field == "weekday": + c_result = move(cpp_extract_weekday(col.view())) + elif field == "hour": + c_result = move(cpp_extract_hour(col.view())) + elif field == "minute": + c_result = move(cpp_extract_minute(col.view())) + elif field == "second": + c_result = move(cpp_extract_second(col.view())) + elif field == "millisecond": + c_result = move( + cpp_extract_millisecond_fraction(col.view()) + ) + elif field == "microsecond": + c_result = move( + cpp_extract_microsecond_fraction(col.view()) + ) + elif field == "nanosecond": + c_result = move( + cpp_extract_nanosecond_fraction(col.view()) + ) + elif field == "day_of_year": + c_result = move(cpp_day_of_year(col.view())) + else: + raise ValueError(f"Invalid datetime field: '{field}'") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index ae5d33aaa46..afb95dba5b3 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -286,7 +286,7 @@ cdef class GroupBy: Returns ------- - Tuple[List[int], Table, Table]] + Tuple[List[int], Table, Table] A tuple of tables containing three items: - A list of integer offsets into the group keys/values - A table of group keys diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 667c67f4c36..438b0ff1634 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -20,6 +20,8 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:func:`read_avro`. 
diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx
index ae5d33aaa46..afb95dba5b3 100644
--- a/python/pylibcudf/pylibcudf/groupby.pyx
+++ b/python/pylibcudf/pylibcudf/groupby.pyx
@@ -286,7 +286,7 @@ cdef class GroupBy:
 
         Returns
         -------
-        Tuple[List[int], Table, Table]]
+        Tuple[List[int], Table, Table]
             A tuple of tables containing three items:
                 - A list of integer offsets into the group keys/values
                 - A table of group keys
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
index 667c67f4c36..438b0ff1634 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -20,6 +20,8 @@ cpdef TableWithMetadata read_avro(
     """
     Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:func:`read_avro`.
+
     Parameters
     ----------
     source_info: SourceInfo
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index df1f1b14247..981ca7b8159 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -59,6 +59,8 @@ cdef class ChunkedParquetReader:
     """
     Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:class:`chunked_parquet_reader`.
+
     Parameters
     ----------
     source_info : SourceInfo
@@ -167,6 +169,8 @@ cpdef read_parquet(
 ):
     """Reads a Parquet file into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:func:`read_parquet`.
+
     Parameters
     ----------
     source_info : SourceInfo
diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx
index b5a7445df36..b3f6a92d85c 100644
--- a/python/pylibcudf/pylibcudf/labeling.pyx
+++ b/python/pylibcudf/pylibcudf/labeling.pyx
@@ -20,6 +20,8 @@ cpdef Column label_bins(
 ):
     """Labels elements based on membership in the specified bins.
 
+    For details, see :cpp:func:`label_bins`.
+
     Parameters
     ----------
     input : Column
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
index bd6e2e0af02..abf4357f862 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources char_types.pyx regex_flags.pyx)
+set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx)
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
index 12cd628fc1f..b7166167cfd 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
@@ -10,5 +10,9 @@
 cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[table] extract(
-        column_view source_strings,
-        regex_program) except +
+        column_view input,
+        regex_program prog) except +
+
+    cdef unique_ptr[column] extract_all_record(
+        column_view input,
+        regex_program prog) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
index b25724586e1..e0a8b776465 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
@@ -9,5 +9,5 @@
 cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[column] findall(
-        column_view source_strings,
-        regex_program) except +
+        column_view input,
+        regex_program prog) except +
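A hedged sketch of driving the cross-referenced readers from Python; `SourceInfo` accepting a list of paths is assumed from pylibcudf's io subpackage, and all reader options are left at their defaults:

    import pylibcudf as plc

    source = plc.io.SourceInfo(["data.parquet"])
    # read_parquet returns a TableWithMetadata; .tbl holds the Table payload.
    result = plc.io.parquet.read_parquet(source)
    table = result.tbl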
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
index 3a89299f11a..019ff3f17ba 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 
 cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil:
 
-    ctypedef enum side_type:
+    cpdef enum class side_type(int32_t):
         LEFT 'cudf::strings::side_type::LEFT'
         RIGHT 'cudf::strings::side_type::RIGHT'
         BOTH 'cudf::strings::side_type::BOTH'
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx
index 947caddc485..6f82124d06e 100644
--- a/python/pylibcudf/pylibcudf/lists.pyx
+++ b/python/pylibcudf/pylibcudf/lists.pyx
@@ -52,6 +52,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx):
 
     All other columns will be duplicated for each element in the list.
 
+    For details, see :cpp:func:`explode_outer`.
+
     Parameters
     ----------
     input : Table
@@ -75,6 +77,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx):
 cpdef Column concatenate_rows(Table input):
     """Concatenate multiple lists columns into a single lists column row-wise.
 
+    For details, see :cpp:func:`concatenate_rows`.
+
     Parameters
     ----------
     input : Table
@@ -96,6 +100,8 @@ cpdef Column concatenate_rows(Table input):
 cpdef Column concatenate_list_elements(Column input, bool dropna):
     """Concatenate multiple lists on the same row into a single list.
 
+    For details, see :cpp:func:`concatenate_list_elements`.
+
     Parameters
     ----------
     input : Column
@@ -168,6 +174,8 @@ cpdef Column contains_nulls(Column input):
     """Create a column of bool values indicating whether
     each row in the lists column contains a null value.
 
+    For details, see :cpp:func:`contains_nulls`.
+
     Parameters
     ----------
     input : Column
@@ -290,6 +298,8 @@ cpdef Column segmented_gather(Column input, Column gather_map_list):
 cpdef Column extract_list_element(Column input, ColumnOrSizeType index):
     """Create a column of extracted list elements.
 
+    For details, see :cpp:func:`extract_list_element`.
+
     Parameters
     ----------
     input : Column
diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx
index a7d43c9d158..6d707b67449 100644
--- a/python/pylibcudf/pylibcudf/merge.pyx
+++ b/python/pylibcudf/pylibcudf/merge.pyx
@@ -19,6 +19,8 @@ cpdef Table merge (
 ):
     """Merge a set of sorted tables.
 
+    For details, see :cpp:func:`merge`.
+
     Parameters
     ----------
     tables_to_merge : list
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx
index b847ade774d..3a771fbe7ef 100644
--- a/python/pylibcudf/pylibcudf/quantiles.pyx
+++ b/python/pylibcudf/pylibcudf/quantiles.pyx
@@ -30,6 +30,8 @@ cpdef Column quantile(
     Computes the specified quantiles by interpolating values between which they
     lie, using the interpolation strategy specified in interp.
 
+    For details, see :cpp:func:`quantile`.
+
     Parameters
     ----------
     input: Column
@@ -91,6 +93,8 @@ cpdef Table quantiles(
     specified quantiles. In the event a quantile lies in between rows, the
     specified interpolation strategy is used to pick between the rows.
 
+    For details, see :cpp:func:`quantiles`.
+
     Parameters
     ----------
     input: Table
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx
index a99145be900..eb1499ebbea 100644
--- a/python/pylibcudf/pylibcudf/reshape.pyx
+++ b/python/pylibcudf/pylibcudf/reshape.pyx
@@ -23,6 +23,8 @@ cpdef Column interleave_columns(Table source_table):
         in = [[A1, A2, A3], [B1, B2, B3]]
         return = [A1, B1, A2, B2, A3, B3]
 
+    For details, see :cpp:func:`interleave_columns`.
+
     Parameters
     ----------
     source_table: Table
@@ -44,6 +46,8 @@ cpdef Column interleave_columns(Table source_table):
 cpdef Table tile(Table source_table, size_type count):
     """Repeats the rows from input table count times to form a new table.
 
+    For details, see :cpp:func:`tile`.
+
     Parameters
     ----------
     source_table: Table
diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx
index ff2468f3f9c..814bc6553d8 100644
--- a/python/pylibcudf/pylibcudf/search.pyx
+++ b/python/pylibcudf/pylibcudf/search.pyx
@@ -19,6 +19,8 @@ cpdef Column lower_bound(
 ):
     """Find smallest indices in haystack where needles may be inserted to retain order.
 
+    For details, see :cpp:func:`lower_bound`.
+
     Parameters
     ----------
     haystack : Table
@@ -58,6 +60,8 @@ cpdef Column upper_bound(
 ):
     """Find largest indices in haystack where needles may be inserted to retain order.
 
+    For details, see :cpp:func:`upper_bound`.
+
     Parameters
     ----------
     haystack : Table
@@ -92,6 +96,8 @@ cpdef Column upper_bound(
 cpdef Column contains(Column haystack, Column needles):
     """Check whether needles are present in haystack.
 
+    For details, see :cpp:func:`contains`.
+
     Parameters
     ----------
     haystack : Column
diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx
index bd173eebacb..42289d54bca 100644
--- a/python/pylibcudf/pylibcudf/sorting.pyx
+++ b/python/pylibcudf/pylibcudf/sorting.pyx
@@ -16,6 +16,8 @@ from .table cimport Table
 cpdef Column sorted_order(Table source_table, list column_order, list null_precedence):
     """Computes the row indices required to sort the table.
 
+    For details, see :cpp:func:`sorted_order`.
+
     Parameters
     ----------
     source_table : Table
@@ -52,6 +54,8 @@ cpdef Column stable_sorted_order(
     """Computes the row indices required to sort the table,
     preserving order of equal elements.
 
+    For details, see :cpp:func:`stable_sorted_order`.
+
     Parameters
     ----------
     source_table : Table
@@ -90,6 +94,8 @@ cpdef Column rank(
 ):
     """Computes the rank of each element in the column.
 
+    For details, see :cpp:func:`rank`.
+
     Parameters
     ----------
     input_view : Column
@@ -128,6 +134,8 @@ cpdef Column rank(
 cpdef bool is_sorted(Table tbl, list column_order, list null_precedence):
     """Checks if the table is sorted.
 
+    For details, see :cpp:func:`is_sorted`.
+
     Parameters
     ----------
     tbl : Table
@@ -165,6 +173,8 @@ cpdef Table segmented_sort_by_key(
 ):
     """Sorts the table by key, within segments.
 
+    For details, see :cpp:func:`segmented_sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -209,6 +219,8 @@ cpdef Table stable_segmented_sort_by_key(
     """Sorts the table by key preserving order of equal elements,
     within segments.
 
+    For details, see :cpp:func:`stable_segmented_sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -251,6 +263,8 @@ cpdef Table sort_by_key(
 ):
     """Sorts the table by key.
 
+    For details, see :cpp:func:`sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -290,6 +304,8 @@ cpdef Table stable_sort_by_key(
 ):
     """Sorts the table by key preserving order of equal elements.
 
+    For details, see :cpp:func:`stable_sort_by_key`.
+ Parameters ---------- values : Table @@ -324,6 +340,8 @@ cpdef Table stable_sort_by_key( cpdef Table sort(Table source_table, list column_order, list null_precedence): """Sorts the table. + For details, see :cpp:func:`sort`. + Parameters ---------- source_table : Table @@ -355,6 +373,8 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): """Sorts the table preserving order of equal elements. + For details, see :cpp:func:`stable_sort`. + Parameters ---------- source_table : Table diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index b574bfa9fa2..d5475ea79d5 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -25,6 +25,8 @@ from .table cimport Table cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. + For details, see :cpp:func:`drop_nulls`. + Parameters ---------- source_table : Table @@ -53,6 +55,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of NaNs. + For details, see :cpp:func:`drop_nans`. + Parameters ---------- source_table : Table @@ -81,6 +85,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """Filters out rows from the input table based on a boolean mask. + For details, see :cpp:func:`apply_boolean_mask`. + Parameters ---------- source_table : Table @@ -111,6 +117,8 @@ cpdef Table unique( ): """Filter duplicate consecutive rows from the input table. + For details, see :cpp:func:`unique`. + Parameters ---------- input : Table @@ -153,6 +161,8 @@ cpdef Table distinct( ): """Get the distinct rows from the input table. + For details, see :cpp:func:`distinct`. + Parameters ---------- input : Table @@ -191,6 +201,8 @@ cpdef Column distinct_indices( ): """Get the indices of the distinct rows from the input table. + For details, see :cpp:func:`distinct_indices`. + Parameters ---------- input : Table @@ -226,6 +238,8 @@ cpdef Table stable_distinct( ): """Get the distinct rows from the input table, preserving input order. + For details, see :cpp:func:`stable_distinct`. + Parameters ---------- input : Table @@ -263,6 +277,8 @@ cpdef size_type unique_count( ): """Returns the number of unique consecutive elements in the input column. + For details, see :cpp:func:`unique_count`. + Parameters ---------- source : Column @@ -294,6 +310,8 @@ cpdef size_type distinct_count( ): """Returns the number of distinct elements in the input column. + For details, see :cpp:func:`distinct_count`. + Parameters ---------- source : Column diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 457e462e3cf..77f20b0b917 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx repeat.pyx replace.pyx slice.pyx +set(cython_sources + capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx + regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx ) set(linked_libraries cudf::cudf) @@ -22,3 +23,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) + +add_subdirectory(convert) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d1f632d6d8e..91d884b294b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,9 +5,14 @@ from . cimport ( case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, replace, slice, + strip, ) +from .side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 250cefedf55..b4856784390 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,10 +5,15 @@ case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, repeat, replace, slice, + strip, ) +from .side_type import SideType diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt new file mode 100644 index 00000000000..175c9b3738e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources convert_durations.pyx convert_datetime.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd new file mode 100644 index 00000000000..05324cb49df --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py new file mode 100644 index 00000000000..d803399d53c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . 
import convert_datetime, convert_durations
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd
new file mode 100644
index 00000000000..07c84d263d6
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_timestamps(
+    Column input,
+    DataType timestamp_type,
+    const string& format
+)
+
+cpdef Column from_timestamps(
+    Column input,
+    const string& format,
+    Column input_strings_names
+)
+
+cpdef Column is_timestamp(
+    Column input,
+    const string& format
+)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx
new file mode 100644
index 00000000000..fcacb096f87
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx
@@ -0,0 +1,68 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_datetime as cpp_convert_datetime,
+)
+
+from pylibcudf.types import DataType
+
+
+cpdef Column to_timestamps(
+    Column input,
+    DataType timestamp_type,
+    const string& format
+):
+    """Convert a strings column to timestamps of the given type.
+
+    For details, see :cpp:func:`cudf::strings::to_timestamps`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_datetime.to_timestamps(
+            input.view(),
+            timestamp_type.c_obj,
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column from_timestamps(
+    Column input,
+    const string& format,
+    Column input_strings_names
+):
+    """Convert a timestamps column to formatted strings.
+
+    For details, see :cpp:func:`cudf::strings::from_timestamps`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_datetime.from_timestamps(
+            input.view(),
+            format,
+            input_strings_names.view()
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column is_timestamp(
+    Column input,
+    const string& format
+):
+    """Check that each string matches the given timestamp format.
+
+    For details, see :cpp:func:`cudf::strings::is_timestamp`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_datetime.is_timestamp(
+            input.view(),
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
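The round trip is exercised by `test_string_convert.py` later in this diff; a minimal sketch of the call shape, obtaining the target DataType through the pyarrow interop layer exactly as the test does:

    import pyarrow as pa
    import pylibcudf as plc

    strs = plc.interop.from_arrow(pa.array(["2011-01-01"]))
    ts = plc.strings.convert.convert_datetime.to_timestamps(
        strs,
        plc.interop.from_arrow(pa.timestamp("us")),
        b"%Y-%m-%d",  # the format maps to a C++ std::string, so pass bytes
    )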
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd
new file mode 100644
index 00000000000..ac11b8959ed
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_durations(
+    Column input,
+    DataType duration_type,
+    const string& format
+)
+
+cpdef Column from_durations(
+    Column input,
+    const string& format
+)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
new file mode 100644
index 00000000000..f3e0b7c9c8e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
@@ -0,0 +1,49 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_durations as cpp_convert_durations,
+)
+
+from pylibcudf.types import DataType
+
+
+cpdef Column to_durations(
+    Column input,
+    DataType duration_type,
+    const string& format
+):
+    """Convert a strings column to durations of the given type.
+
+    For details, see :cpp:func:`cudf::strings::to_durations`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_durations.to_durations(
+            input.view(),
+            duration_type.c_obj,
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column from_durations(
+    Column input,
+    const string& format
+):
+    """Convert a durations column to formatted strings.
+
+    For details, see :cpp:func:`cudf::strings::from_durations`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_durations.from_durations(
+            input.view(),
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd
new file mode 100644
index 00000000000..3871f5a0e4e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog)
+
+cpdef Column extract_all_record(Column input, RegexProgram prog)
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx
new file mode 100644
index 00000000000..dcb11ca10ce
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport extract as cpp_extract
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog):
+    """
+    Returns a table of strings columns where each column
+    corresponds to the matching group specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Table
+        Columns of strings extracted from the input column.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Column extract_all_record(Column input, RegexProgram prog):
+    """
+    Returns a lists column of strings where each row contains the
+    strings extracted for the matching groups specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract_all_record`.
+ + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Column + Lists column containing strings extracted from the input column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd new file mode 100644 index 00000000000..54afa088141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx new file mode 100644 index 00000000000..3a6b87504b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport findall as cpp_findall +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern): + """ + Returns a lists column of strings for each matching occurrence using + the regex_program pattern within each string. + + For details, see :cpp:func:`cudf::strings::findall`. + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New lists column of strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd new file mode 100644 index 00000000000..34b7a580380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx new file mode 100644 index 00000000000..acdc7d6ff1f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type import \ + side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/strings/strip.pxd new file mode 100644 index 00000000000..8bbe4753edd --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=*,
+    Scalar to_strip=*
+)
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx
new file mode 100644
index 00000000000..429a23c3cdf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/strip.pyx
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings cimport strip as cpp_strip
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=side_type.BOTH,
+    Scalar to_strip=None
+):
+    """Removes the specified characters from the beginning
+    or end (or both) of each string.
+
+    For details, see :cpp:func:`cudf::strings::strip`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column for this operation
+    side : SideType, default SideType.BOTH
+        Indicates whether characters are stripped from the
+        beginning, the end, or both of each string.
+    to_strip : Scalar
+        UTF-8 encoded characters to strip from each string.
+        Default is the empty string, which strips whitespace characters.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New strings column.
+    """
+
+    if to_strip is None:
+        to_strip = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef unique_ptr[column] c_result
+    cdef string_scalar* cpp_to_strip
+    cpp_to_strip = <string_scalar *>(to_strip.c_obj.get())
+
+    with nogil:
+        c_result = cpp_strip.strip(
+            input.view(),
+            side,
+            dereference(cpp_to_strip)
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini
index 1761c0f011c..f572f85ca49 100644
--- a/python/pylibcudf/pylibcudf/tests/pytest.ini
+++ b/python/pylibcudf/pylibcudf/tests/pytest.ini
@@ -6,3 +6,4 @@ filterwarnings =
     error
     ignore:::.*xdist.*
     ignore:::.*pytest.*
+addopts = --tb=native
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index d3aa6101e2d..89c96829e71 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
import datetime +import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,7 +11,7 @@ @pytest.fixture -def column(has_nulls): +def date_column(has_nulls): values = [ datetime.date(1999, 1, 1), datetime.date(2024, 10, 12), @@ -22,9 +23,41 @@ def column(has_nulls): return plc.interop.from_arrow(pa.array(values, type=pa.date32())) -def test_extract_year(column): - got = plc.datetime.extract_year(column) +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def datetime_column(has_nulls, request): + values = [ + datetime.datetime(1999, 1, 1), + datetime.datetime(2024, 10, 12), + datetime.datetime(1970, 1, 1), + datetime.datetime(2260, 1, 1), + datetime.datetime(2024, 2, 29, 3, 14, 15), + datetime.datetime(2024, 2, 29, 3, 14, 15, 999), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow( + pa.array(values, type=pa.timestamp(request.param)) + ) + + +@pytest.mark.parametrize( + "component, pc_fun", + [ + ("year", pc.year), + ("month", pc.month), + ("day", pc.day), + ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), + ("hour", pc.hour), + ("minute", pc.minute), + ("second", pc.second), + ("millisecond", pc.millisecond), + ("microsecond", pc.microsecond), + ("nanosecond", pc.nanosecond), + ], +) +def test_extraction(datetime_column, component, pc_fun): + got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16()) + expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py new file mode 100644 index 00000000000..e9e95459d0e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + scope="module", + params=[ + pa.timestamp("ns"), + pa.timestamp("us"), + pa.timestamp("ms"), + pa.timestamp("s"), + ], +) +def timestamp_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_timestamp_col(): + return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_timestamp_col(pa_timestamp_col): + return plc.interop.from_arrow(pa_timestamp_col) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +@pytest.mark.parametrize("format", ["%Y-%m-%d"]) +def test_to_datetime( + pa_timestamp_col, plc_timestamp_col, timestamp_type, format +): + expect = pa.compute.strptime(pa_timestamp_col, format, timestamp_type.unit) + got = plc.strings.convert.convert_datetime.to_timestamps( + plc_timestamp_col, + plc.interop.from_arrow(timestamp_type), + format.encode(), + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", ["%H:%M:%S"]) +def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format.encode(), + ) + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py new file mode 100644 index 00000000000..788b86423c4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+
+def test_extract():
+    pattern = "([ab])(\\d)"
+    pa_pattern = "(?P<letter>[ab])(?P<digit>\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pc.extract_regex(arr, pa_pattern)
+    for i, result_col in enumerate(result.itercolumns()):
+        expected_col = pa.chunked_array(expected.field(i))
+        assert result_col.fill_null("").equals(expected_col)
+
+
+def test_extract_all_record():
+    pattern = "([ab])(\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract_all_record(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pa.chunked_array(
+        [pa.array([["a", "1"], ["b", "2"], None], type=result.type)]
+    )
+    assert result.equals(expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
new file mode 100644
index 00000000000..994552fa276
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import re
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_findall():
+    arr = pa.array(["bunny", "rabbit", "hare", "dog"])
+    pattern = "[ab]"
+    result = plc.strings.findall.findall(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    pa_result = plc.interop.to_arrow(result)
+    expected = pa.array(
+        [re.findall(pattern, elem) for elem in arr.to_pylist()],
+        type=pa_result.type,
+    )
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
new file mode 100644
index 00000000000..005e5e4a405
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+data_strings = [
+    "AbC",
+    "123abc",
+    "",
+    " ",
+    None,
+    "aAaaaAAaa",
+    " ab c ",
+    "abc123",
+    " ",
+    "\tabc\t",
+    "\nabc\n",
+    "\r\nabc\r\n",
+    "\t\n abc \n\t",
+    "!@#$%^&*()",
+    " abc!!! ",
+    " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", +] + + +@pytest.fixture +def pa_col(): + return pa.array(data_strings, type=pa.string()) + + +@pytest.fixture +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture(params=strip_chars) +def pa_char(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture +def plc_char(pa_char): + return plc.interop.from_arrow(pa_char) + + +def test_strip(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.strip() + return st.strip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.BOTH, plc_char) + assert_column_eq(expected, got) + + +def test_strip_right(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.rstrip() + return st.rstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip( + plc_col, plc.strings.SideType.RIGHT, plc_char + ) + assert_column_eq(expected, got) + + +def test_strip_left(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.lstrip() + return st.lstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.LEFT, plc_char) + assert_column_eq(expected, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index 06fc35d8835..d5c618f07e4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -29,3 +29,54 @@ def test_nans_to_nulls(has_nans): got = input.with_mask(mask, null_count) assert_column_eq(expect, got) + + +def test_bools_to_mask_roundtrip(): + pa_array = pa.array([True, None, False]) + plc_input = plc.interop.from_arrow(pa_array) + mask, result_null_count = plc.transform.bools_to_mask(plc_input) + + assert result_null_count == 2 + result = plc_input.with_mask(mask, result_null_count) + assert_column_eq(pa.array([True, None, None]), result) + + plc_output = plc.transform.mask_to_bools(mask.ptr, 0, len(pa_array)) + result_pa = plc.interop.to_arrow(plc_output) + expected_pa = pa.chunked_array([[True, False, False]]) + assert result_pa.equals(expected_pa) + + +def test_encode(): + pa_table = pa.table({"a": [1, 3, 4], "b": [1, 2, 4]}) + plc_input = plc.interop.from_arrow(pa_table) + result_table, result_column = plc.transform.encode(plc_input) + pa_table_result = plc.interop.to_arrow(result_table) + pa_column_result = plc.interop.to_arrow(result_column) + + pa_table_expected = pa.table( + [[1, 3, 4], [1, 2, 4]], + schema=pa.schema( + [ + pa.field("", pa.int64(), nullable=False), + pa.field("", pa.int64(), nullable=False), + ] + ), + ) + assert pa_table_result.equals(pa_table_expected) + + pa_column_expected = pa.chunked_array([[0, 1, 2]], type=pa.int32()) + assert pa_column_result.equals(pa_column_expected) + + +def test_one_hot_encode(): + pa_column = pa.array([1, 2, 3]) + pa_categories = pa.array([0, 0, 0]) + 
plc_input = plc.interop.from_arrow(pa_column) + plc_categories = plc.interop.from_arrow(pa_categories) + plc_table = plc.transform.one_hot_encode(plc_input, plc_categories) + result = plc.interop.to_arrow(plc_table) + expected = pa.table( + [[False] * 3] * 3, + schema=pa.schema([pa.field("", pa.bool_(), nullable=False)] * 3), + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 4b21feffe25..b530f433c97 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx) + +cpdef tuple[Table, Column] encode(Table input) + +cpdef Table one_hot_encode(Column input_column, Column categories) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 100ccb580ce..de425a27c15 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,19 +1,27 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from libcpp.utility cimport move, pair from pylibcudf.libcudf cimport transform as cpp_transform -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport bitmask_type, size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .types cimport DataType +from .utils cimport int_to_bitmask_ptr cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """Create a null mask preserving existing nulls and converting nans to null. + For details, see :cpp:func:`nans_to_nulls`. + Parameters ---------- input : Column @@ -32,3 +40,141 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), c_result.second ) + + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): + """Create a bitmask from a column of boolean elements + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + tuple[gpumemoryview, int] + Two-tuple of a gpumemoryview wrapping the bitmask and the null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.bools_to_mask(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) + + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): + """Creates a boolean column from given bitmask. 
+
+    Parameters
+    ----------
+    bitmask : int
+        Pointer to the bitmask which needs to be converted
+    begin_bit : int
+        Position of the bit from which the conversion should start
+    end_bit : int
+        Position of the bit before which the conversion should stop
+
+    Returns
+    -------
+    Column
+        Boolean column of the mask range [begin_bit, end_bit)
+    """
+    cdef unique_ptr[column] c_result
+    cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask)
+
+    with nogil:
+        c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx):
+    """Create a new column by applying a unary function against every
+    element of an input column.
+
+    Parameters
+    ----------
+    input : Column
+        Column to transform.
+    unary_udf : str
+        The PTX/CUDA string of the unary function to apply.
+    output_type : DataType
+        The output type that the unary_udf is expected to produce.
+    is_ptx : bool
+        If `True`, the UDF is treated as PTX code.
+        If `False`, the UDF is treated as CUDA code.
+
+    Returns
+    -------
+    Column
+        The transformed column having the UDF applied to each element.
+    """
+    cdef unique_ptr[column] c_result
+    cdef string c_unary_udf = unary_udf.encode()
+    cdef bool c_is_ptx = is_ptx
+
+    with nogil:
+        c_result = move(
+            cpp_transform.transform(
+                input.view(), c_unary_udf, output_type.c_obj, c_is_ptx
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef tuple[Table, Column] encode(Table input):
+    """Encode the rows of the given table as integers.
+
+    Parameters
+    ----------
+    input : Table
+        Table containing values to be encoded.
+
+    Returns
+    -------
+    tuple[Table, Column]
+        The distinct rows of the input table in sorted order,
+        and a column of integer indices representing the encoded rows.
+    """
+    cdef pair[unique_ptr[table], unique_ptr[column]] c_result
+
+    with nogil:
+        c_result = move(cpp_transform.encode(input.view()))
+
+    return (
+        Table.from_libcudf(move(c_result.first)),
+        Column.from_libcudf(move(c_result.second))
+    )
+
+cpdef Table one_hot_encode(Column input, Column categories):
+    """Encodes `input` by generating a new column
+    for each value in `categories` indicating the presence
+    of that value in `input`.
+
+    Parameters
+    ----------
+    input : Column
+        Column containing values to be encoded.
+    categories : Column
+        Column containing categories.
+
+    Returns
+    -------
+    Table
+        A table of the encoded values.
+    """
+    cdef pair[unique_ptr[column], table_view] c_result
+    cdef Table owner_table
+
+    with nogil:
+        c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view()))
+
+    owner_table = Table(
+        [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns()
+    )
+
+    return Table.from_table_view(c_result.second, owner_table)
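Taken together, the new strings APIs compose into small pipelines. A sketch using only calls exercised by the tests above; the default `strip` arguments are taken to mean strip-whitespace-from-both-sides, per the docstring earlier in this diff:

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array([" a1 ", " b2 ", " c3 "]))
    stripped = plc.strings.strip.strip(col)  # SideType.BOTH, whitespace
    prog = plc.strings.regex_program.RegexProgram.create(
        "([ab])(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    groups = plc.strings.extract.extract(stripped, prog)   # Table: one column per group
    matches = plc.strings.findall.findall(stripped, prog)  # lists column of matches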
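Likewise, a sketch of the encode entry points documented above, grounded in the semantics shown by `test_transform.py` (non-degenerate categories chosen here so the one-hot output varies):

    import pyarrow as pa
    import pylibcudf as plc

    # encode: distinct rows plus an integer code column for the input rows.
    tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 1]}))
    distinct, codes = plc.transform.encode(tbl)

    # one_hot_encode: one boolean column per category value.
    col = plc.interop.from_arrow(pa.array([1, 2, 1]))
    cats = plc.interop.from_arrow(pa.array([1, 2]))
    onehot = plc.transform.one_hot_encode(col, cats)
    # Expected columns: [True, False, True] and [False, True, False].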
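Finally, the sorting wrappers that gained cross-references earlier in this diff follow the same Table/Column conventions; the `plc.types.Order` and `plc.types.NullOrder` enum homes are assumptions:

    import pyarrow as pa
    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({"a": [3, 1, 2]}))
    order = plc.sorting.sorted_order(
        tbl, [plc.types.Order.ASCENDING], [plc.types.NullOrder.AFTER]
    )  # Column of row indices that would sort the table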