diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a35802f2ab0..ceee9074b93 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,7 +25,8 @@ jobs:
       - docs-build
       - wheel-build-cudf
       - wheel-tests-cudf
-      - test-cudf-polars
+      - wheel-build-cudf-polars
+      - wheel-tests-cudf-polars
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
       - devcontainer
@@ -133,9 +134,18 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
-  test-cudf-polars:
+  wheel-build-cudf-polars:
     needs: wheel-build-cudf
     secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      script: "ci/build_wheel_cudf_polars.sh"
+  wheel-tests-cudf-polars:
+    needs: wheel-build-cudf-polars
+    secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -143,7 +153,7 @@ jobs:
       build_type: pull-request
       # This always runs, but only fails if this PR touches code in
       # pylibcudf or cudf_polars
-      script: "ci/test_cudf_polars.sh"
+      script: "ci/test_wheel_cudf_polars.sh"
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f8c4f4b9143..bbcd78d051f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -136,11 +136,6 @@ repos:
             .*test.*|
             ^CHANGELOG.md$
           )
-  - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
-    hooks:
-      - id: rapids-dependency-file-generator
-        args: ["--clean"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.4.8
     hooks:
@@ -149,7 +144,7 @@ repos:
       - id: ruff-format
         files: python/.*$
   - repo: https://github.com/rapidsai/pre-commit-hooks
-    rev: v0.0.3
+    rev: v0.2.0
     hooks:
       - id: verify-copyright
         exclude: |
@@ -158,6 +153,12 @@ repos:
             cpp/src/io/parquet/ipc/Message_generated[.]h$|
             cpp/src/io/parquet/ipc/Schema_generated[.]h$
           )
+      - id: verify-alpha-spec
+  - repo: https://github.com/rapidsai/dependency-file-generator
+    rev: v1.13.11
+    hooks:
+      - id: rapids-dependency-file-generator
+        args: ["--clean"]
 
 default_language_version:
       python: python3
diff --git a/build.sh b/build.sh
index 4291c88ea12..52bb1e64d16 100755
--- a/build.sh
+++ b/build.sh
@@ -17,7 +17,7 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
+VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
 HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
    clean                         - remove all existing build artifacts and configuration (start
                                    over)
@@ -39,6 +39,7 @@ HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [li
    --opensource_nvcomp           - disable use of proprietary nvcomp extensions
    --show_depr_warn              - show cmake deprecation warnings
    --ptds                        - enable per-thread default stream
+   --disable_large_strings       - disable large strings support
    --build_metrics               - generate build metrics report for libcudf
    --incl_cache_stats            - include cache statistics in build metrics report
    --cmake-args=\\\"<args>\\\"   - pass arbitrary list of CMake configuration options (escape all quotes in argument)
@@ -69,6 +70,7 @@ BUILD_DISABLE_DEPRECATION_WARNINGS=ON
 BUILD_PER_THREAD_DEFAULT_STREAM=OFF
 BUILD_REPORT_METRICS=OFF
 BUILD_REPORT_INCL_CACHE_STATS=OFF
+BUILD_DISABLE_LARGE_STRINGS=OFF
 USE_PROPRIETARY_NVCOMP=ON
 PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true"
 
@@ -153,6 +155,7 @@ function buildLibCudfJniInDocker {
                 -DCUDF_ENABLE_ARROW_S3=OFF \
                 -DBUILD_TESTS=OFF \
                 -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=ON \
+                -DCUDF_LARGE_STRINGS_DISABLED=ON \
                 -DRMM_LOGGING_LEVEL=OFF \
                 -DBUILD_SHARED_LIBS=OFF && \
              cmake --build . --parallel ${PARALLEL_LEVEL} && \
@@ -239,6 +242,9 @@ if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then
     EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON"
 fi
 
+if hasArg --disable_large_strings; then
+    BUILD_DISABLE_LARGE_STRINGS="ON"
+fi
 
 # If clean given, run it prior to any other steps
 if hasArg clean; then
@@ -292,6 +298,7 @@ if buildAll || hasArg libcudf; then
           -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \
           -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \
           -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \
+          -DCUDF_LARGE_STRINGS_DISABLED=${BUILD_DISABLE_LARGE_STRINGS} \
           -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
           ${EXTRA_CMAKE_ARGS}
 
diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh
new file mode 100755
index 00000000000..9c945e11c00
--- /dev/null
+++ b/ci/build_wheel_cudf_polars.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_dir="python/cudf_polars"
+
+./ci/build_wheel.sh ${package_dir}
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh
new file mode 100755
index 00000000000..c10612a065a
--- /dev/null
+++ b/ci/run_cudf_polars_pytests.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# It is essential to cd into python/cudf_polars as `pytest-xdist` + `coverage` seem to work only at this directory level.
+
+# Support invoking run_cudf_polars_pytests.sh outside the script directory
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/
+
+python -m pytest --cache-clear "$@" tests
diff --git a/ci/test_java.sh b/ci/test_java.sh
index 9713eb192d2..629ad11014a 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -39,6 +39,9 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
+# disable large strings
+export LIBCUDF_LARGE_STRINGS_ENABLED=0
+
 rapids-logger "Run Java tests"
 pushd java
 mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF
diff --git a/ci/test_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
similarity index 65%
rename from ci/test_cudf_polars.sh
rename to ci/test_wheel_cudf_polars.sh
index 669e049ab26..900acd5d473 100755
--- a/ci/test_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -18,21 +18,14 @@ else
 fi
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
-RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
-mkdir -p "${RAPIDS_TESTS_DIR}"
-
-rapids-logger "Install cudf wheel"
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/cudf*.whl)[test]
-
-rapids-logger "Install polars (allow pre-release versions)"
-python -m pip install 'polars>=1.0.0a0'
+# Download the cudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+python -m pip install ./local-cudf-dep/cudf*.whl
 
 rapids-logger "Install cudf_polars"
-python -m pip install --no-deps python/cudf_polars
+python -m pip install $(echo ./dist/cudf_polars*.whl)[test]
 
 rapids-logger "Run cudf_polars tests"
 
@@ -44,13 +37,11 @@ EXITCODE=0
 trap set_exitcode ERR
 set +e
 
-python -m pytest \
-       --cache-clear \
+./ci/run_cudf_polars_pytests.sh \
        --cov cudf_polars \
        --cov-fail-under=100 \
-       --cov-config=python/cudf_polars/pyproject.toml \
-       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \
-       python/cudf_polars/tests
+       --cov-config=./pyproject.toml \
+       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml"
 
 trap ERR
 set -e
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 2b20b9d9ce4..c3800d3cc25 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -8,7 +8,7 @@ RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="
 
 # Download the cudf built in the previous step
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install --no-deps ./local-cudf-dep/cudf*.whl
+python -m pip install ./local-cudf-dep/cudf*.whl
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 946e2d1cd32..b8d73a01f96 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cachetools
 - clang-tools=16.0.6
 - clang==16.0.6
-- cmake>=3.26.4
+- cmake>=3.26.4,!=3.30.0
 - cramjam
 - cubinlinker
 - cuda-nvtx=11.8
@@ -26,7 +26,6 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.8.*
 - dask-cuda==24.8.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
@@ -44,10 +43,10 @@ dependencies:
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
-- libkvikio==24.8.*
+- libkvikio==24.8.*,>=0.0.0a0
 - libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.8.*
+- librmm==24.8.*,>=0.0.0a0
 - make
 - moto>=4.0.8
 - msgpack-python
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index f069616ddbe..c32d21c5d36 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cachetools
 - clang-tools=16.0.6
 - clang==16.0.6
-- cmake>=3.26.4
+- cmake>=3.26.4,!=3.30.0
 - cramjam
 - cuda-cudart-dev
 - cuda-nvcc
@@ -27,7 +27,6 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.8.*
 - dask-cuda==24.8.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
@@ -43,10 +42,10 @@ dependencies:
 - libarrow==16.1.0.*
 - libcufile-dev
 - libcurand-dev
-- libkvikio==24.8.*
+- libkvikio==24.8.*,>=0.0.0a0
 - libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.8.*
+- librmm==24.8.*,>=0.0.0a0
 - make
 - moto>=4.0.8
 - msgpack-python
@@ -66,7 +65,7 @@ dependencies:
 - pre-commit
 - pyarrow==16.1.0.*
 - pydata-sphinx-theme!=0.14.2
-- pynvjitlink
+- pynvjitlink>=0.0.0a0
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml
index d399e440edd..af894cccda0 100644
--- a/conda/recipes/cudf/conda_build_config.yaml
+++ b/conda/recipes/cudf/conda_build_config.yaml
@@ -11,7 +11,7 @@ c_stdlib_version:
   - "2.17"
 
 cmake_version:
-  - ">=3.26.4"
+  - ">=3.26.4,!=3.30.0"
 
 cuda_compiler:
   - cuda-nvcc
diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml
index d399e440edd..af894cccda0 100644
--- a/conda/recipes/cudf_kafka/conda_build_config.yaml
+++ b/conda/recipes/cudf_kafka/conda_build_config.yaml
@@ -11,7 +11,7 @@ c_stdlib_version:
   - "2.17"
 
 cmake_version:
-  - ">=3.26.4"
+  - ">=3.26.4,!=3.30.0"
 
 cuda_compiler:
   - cuda-nvcc
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index c01178bf732..4f99411e978 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -17,7 +17,7 @@ c_stdlib_version:
   - "2.17"
 
 cmake_version:
-  - ">=3.26.4"
+  - ">=3.26.4,!=3.30.0"
 
 libarrow_version:
   - "==16.1.0"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 94df0433b81..80f737378c5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -52,6 +52,8 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
 option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON)
 mark_as_advanced(CUDF_BUILD_TESTUTIL)
 option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
+option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
+mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
 option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
 option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF)
 option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
@@ -365,6 +367,7 @@ add_library(
   src/interop/to_arrow_device.cu
   src/interop/from_arrow_device.cu
   src/interop/from_arrow_host.cu
+  src/interop/from_arrow_stream.cu
   src/interop/to_arrow_schema.cpp
   src/interop/detail/arrow_allocator.cpp
   src/io/avro/avro.cpp
@@ -782,6 +785,11 @@ if(NOT USE_NVTX)
   target_compile_definitions(cudf PUBLIC NVTX_DISABLE)
 endif()
 
+# Disable large strings support
+if(CUDF_LARGE_STRINGS_DISABLED)
+  target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED)
+endif()
+
 # Define RMM logging level
 target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL")
 
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 8a48126e195..a5b248135c1 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -40,8 +40,13 @@ target_include_directories(
 
 # Use an OBJECT library so we only compile these helper source files only once
 add_library(
-  cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
-                               synchronization/synchronization.cpp io/cuio_common.cpp
+  cudf_benchmark_common OBJECT
+  "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
+  synchronization/synchronization.cpp
+  io/cuio_common.cpp
+  common/table_utilities.cpp
+  common/benchmark_utilities.cpp
+  common/nvbench_utilities.cpp
 )
 target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
 add_custom_command(
diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp
new file mode 100644
index 00000000000..0b9fc17e779
--- /dev/null
+++ b/cpp/benchmarks/common/benchmark_utilities.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "benchmark_utilities.hpp"
+
+void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration)
+{
+  state.SetItemsProcessed(state.iterations() * items_processed_per_iteration);
+}
+
+void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration)
+{
+  state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration);
+}
diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp
new file mode 100644
index 00000000000..c5c80e73674
--- /dev/null
+++ b/cpp/benchmarks/common/benchmark_utilities.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+/**
+ * @brief Sets the number of items processed during the benchmark.
+ *
+ * This function could be used instead of ::benchmark::State.SetItemsProcessed()
+ * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration.
+ *
+ * @param state the benchmark state
+ * @param items_processed_per_iteration number of items processed per iteration
+ */
+void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration);
+
+/**
+ * @brief Sets the number of bytes processed during the benchmark.
+ *
+ * This function could be used instead of ::benchmark::State.SetItemsProcessed()
+ * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration.
+ *
+ * @param state the benchmark state
+ * @param bytes_processed_per_iteration number of bytes processed per iteration
+ */
+void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration);
diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp
new file mode 100644
index 00000000000..c740eaa52f4
--- /dev/null
+++ b/cpp/benchmarks/common/nvbench_utilities.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nvbench_utilities.hpp"
+
+#include <nvbench/nvbench.cuh>
+
+// This function is copied over from
+// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224.
+void set_throughputs(nvbench::state& state)
+{
+  double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+
+  if (const auto items = state.get_element_count(); items != 0) {
+    auto& summ = state.add_summary("nv/cold/bw/item_rate");
+    summ.set_string("name", "Elem/s");
+    summ.set_string("hint", "item_rate");
+    summ.set_string("description", "Number of input elements processed per second");
+    summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
+  }
+
+  if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) {
+    const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cuda_time;
+    {
+      auto& summ = state.add_summary("nv/cold/bw/global/bytes_per_second");
+      summ.set_string("name", "GlobalMem BW");
+      summ.set_string("hint", "byte_rate");
+      summ.set_string("description",
+                      "Number of bytes read/written per second to the CUDA "
+                      "device's global memory");
+      summ.set_float64("value", avg_used_gmem_bw);
+    }
+
+    {
+      const auto peak_gmem_bw =
+        static_cast<double>(state.get_device()->get_global_memory_bus_bandwidth());
+
+      auto& summ = state.add_summary("nv/cold/bw/global/utilization");
+      summ.set_string("name", "BWUtil");
+      summ.set_string("hint", "percentage");
+      summ.set_string("description",
+                      "Global device memory utilization as a percentage of the "
+                      "device's peak bandwidth");
+      summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw);
+    }
+  }
+}
diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp
new file mode 100644
index 00000000000..98d879efac5
--- /dev/null
+++ b/cpp/benchmarks/common/nvbench_utilities.hpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace nvbench {
+struct state;
+}
+
+/**
+ * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the
+ * nvbench results summary.
+ *
+ * This function could be used to work around a known issue that the throughput statistics
+ * should be added before the nvbench::state.exec() call, otherwise they will not be printed
+ * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details.
+ */
+void set_throughputs(nvbench::state& state);
diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp
new file mode 100644
index 00000000000..a6fbdac9fb8
--- /dev/null
+++ b/cpp/benchmarks/common/table_utilities.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "table_utilities.hpp"
+
+#include <cudf/reduction.hpp>
+#include <cudf/transform.hpp>
+
+#include <cmath>
+
+int64_t estimate_size(cudf::column_view const& col)
+{
+  return estimate_size(cudf::table_view({col}));
+}
+
+int64_t estimate_size(cudf::table_view const& view)
+{
+  // Compute the size in bits for each row.
+  auto const row_sizes = cudf::row_bit_count(view);
+  // Accumulate the row sizes to compute a sum.
+  auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  cudf::data_type sum_dtype{cudf::type_id::INT64};
+  auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype);
+  auto const total_size_in_bits =
+    static_cast<cudf::numeric_scalar<int64_t>*>(total_size_scalar.get())->value();
+  // Convert the size in bits to the size in bytes.
+  return static_cast<int64_t>(std::ceil(static_cast<double>(total_size_in_bits) / 8));
+}
diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp
new file mode 100644
index 00000000000..04ee847d397
--- /dev/null
+++ b/cpp/benchmarks/common/table_utilities.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/table/table_view.hpp>
+
+/**
+ * @brief Estimates the column size in bytes.
+ *
+ * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
+ * and accumulates them, the returned estimate may be an inexact approximation in some
+ * cases. See cudf::row_bit_count() for more details.
+ *
+ * @param view The column view to estimate its size
+ */
+int64_t estimate_size(cudf::column_view const& view);
+
+/**
+ * @brief Estimates the table size in bytes.
+ *
+ * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
+ * and accumulates them, the returned estimate may be an inexact approximation in some
+ * cases. See cudf::row_bit_count() for more details.
+ *
+ * @param view The table view to estimate its size
+ */
+int64_t estimate_size(cudf::table_view const& view);
diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp
index 8b1e71c1585..e9d23881764 100644
--- a/cpp/benchmarks/reduction/anyall.cpp
+++ b/cpp/benchmarks/reduction/anyall.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state,
     cuda_event_timer timer(state, true);
     auto result = cudf::reduce(*values, *agg, output_dtype);
   }
+
+  // The benchmark takes a column and produces one scalar.
+  set_items_processed(state, column_size + 1);
+  set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype));
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp
index c1c44c919ac..5095337dbb3 100644
--- a/cpp/benchmarks/reduction/dictionary.cpp
+++ b/cpp/benchmarks/reduction/dictionary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
@@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state,
     cuda_event_timer timer(state, true);
     auto result = cudf::reduce(*values, *agg, output_dtype);
   }
+
+  // The benchmark takes a column and produces two scalars.
+  set_items_processed(state, column_size + 1);
+
+  // We don't set the metrics for the size read/written as row_bit_count() doesn't
+  // support the dictionary type yet (and so is estimate_size()).
+  // See https://github.com/rapidsai/cudf/issues/16121 for details.
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp
index 963c26692e7..050f2887221 100644
--- a/cpp/benchmarks/reduction/minmax.cpp
+++ b/cpp/benchmarks/reduction/minmax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -28,14 +30,19 @@ template <typename type>
 void BM_reduction(benchmark::State& state)
 {
   cudf::size_type const column_size{(cudf::size_type)state.range(0)};
-  auto const dtype = cudf::type_to_id<type>();
+  auto const dtype_id = cudf::type_to_id<type>();
   auto const input_column =
-    create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity());
+    create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity());
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
     auto result = cudf::minmax(*input_column);
   }
+
+  // The benchmark takes a column and produces two scalars.
+  set_items_processed(state, column_size + 2);
+  cudf::data_type dtype = cudf::data_type{dtype_id};
+  set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype));
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp
index e55f3b9e09f..14876c80d3e 100644
--- a/cpp/benchmarks/reduction/rank.cpp
+++ b/cpp/benchmarks/reduction/rank.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 
 #include <cudf/detail/scan.hpp>
 #include <cudf/filling.hpp>
@@ -39,11 +41,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_list<typ
   auto const new_tbl = cudf::repeat(table->view(), 2);
   cudf::column_view input(new_tbl->view().column(0));
 
+  std::unique_ptr<cudf::column> result = nullptr;
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     rmm::cuda_stream_view stream_view{launch.get_stream()};
-    auto result = cudf::detail::inclusive_dense_rank_scan(
+    result = cudf::detail::inclusive_dense_rank_scan(
       input, stream_view, rmm::mr::get_current_device_resource());
   });
+
+  state.add_element_count(input.size());
+  state.add_global_memory_reads(estimate_size(input));
+  state.add_global_memory_writes(estimate_size(result->view()));
+
+  set_throughputs(state);
 }
 
 using data_type = nvbench::type_list<int32_t, cudf::list_view>;
diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp
index 5bd3e2e3bba..63c96f4fe9e 100644
--- a/cpp/benchmarks/reduction/reduce.cpp
+++ b/cpp/benchmarks/reduction/reduce.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -46,6 +48,10 @@ void BM_reduction(benchmark::State& state, std::unique_ptr<cudf::reduce_aggregat
     cuda_event_timer timer(state, true);
     auto result = cudf::reduce(*input_column, *agg, output_dtype);
   }
+
+  // The benchmark takes a column and produces two scalars.
+  set_items_processed(state, column_size + 1);
+  set_bytes_processed(state, estimate_size(input_column->view()) + cudf::size_of(output_dtype));
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp
index 8c9883ece9c..dc05aad9807 100644
--- a/cpp/benchmarks/reduction/scan.cpp
+++ b/cpp/benchmarks/reduction/scan.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -34,11 +36,16 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
   auto const column = create_random_column(dtype, row_count{n_rows});
   if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0);
 
+  std::unique_ptr<cudf::column> result = nullptr;
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
-    auto result = cudf::scan(
+    result = cudf::scan(
       *column, *cudf::make_min_aggregation<cudf::scan_aggregation>(), cudf::scan_type::INCLUSIVE);
   }
+
+  // The benchmark takes a column and produces a new column of the same size as input.
+  set_items_processed(state, n_rows * 2);
+  set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view()));
 }
 
 #define SCAN_BENCHMARK_DEFINE(name, type, nulls)                          \
diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp
index ee97b54fbef..a781f75a314 100644
--- a/cpp/benchmarks/reduction/scan_structs.cpp
+++ b/cpp/benchmarks/reduction/scan_structs.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/scan.hpp>
@@ -45,16 +47,24 @@ static void nvbench_structs_scan(nvbench::state& state)
   auto [null_mask, null_count] = create_random_null_mask(size, null_probability);
   auto const input             = cudf::make_structs_column(
     size, std::move(data_table->release()), null_count, std::move(null_mask));
+  auto input_view = input->view();
 
   auto const agg         = cudf::make_min_aggregation<cudf::scan_aggregation>();
   auto const null_policy = static_cast<cudf::null_policy>(state.get_int64("null_policy"));
   auto const stream      = cudf::get_default_stream();
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  std::unique_ptr<cudf::column> result = nullptr;
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto const result = cudf::detail::scan_inclusive(
-      *input, *agg, null_policy, stream, rmm::mr::get_current_device_resource());
+    result = cudf::detail::scan_inclusive(
+      input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource());
   });
+
+  state.add_element_count(input_view.size());
+  state.add_global_memory_reads(estimate_size(input_view));
+  state.add_global_memory_writes(estimate_size(result->view()));
+
+  set_throughputs(state);
 }
 
 NVBENCH_BENCH(nvbench_structs_scan)
diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 9758958b44f..6ec35ddcaf1 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -15,6 +15,11 @@
 # This function finds cuCollections and performs any additional configuration.
 function(find_and_configure_cucollections)
   include(${rapids-cmake-dir}/cpm/cuco.cmake)
+  include(${rapids-cmake-dir}/cpm/package_override.cmake)
+
+  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
+  rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json")
+
   if(BUILD_SHARED_LIBS)
     rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports)
   else()
diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff
new file mode 100644
index 00000000000..0f334c0e81f
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff
@@ -0,0 +1,227 @@
+diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh
+index 7f9de01..5228193 100644
+--- a/include/cuco/aow_storage.cuh
++++ b/include/cuco/aow_storage.cuh
+@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
+    * @param size Number of windows to (de)allocate
+    * @param allocator Allocator used for (de)allocating device storage
+    */
+-  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;
++  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {});
+ 
+   aow_storage(aow_storage&&) = default;  ///< Move constructor
+   /**
+@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
+    * @param key Key to which all keys in `slots` are initialized
+    * @param stream Stream used for executing the kernel
+    */
+-  void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;
++  void initialize(value_type key, cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
+diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+index c2c9c14..8ac4236 100644
+--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
++++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+@@ -125,7 +125,7 @@ class open_addressing_impl {
+                                  KeyEqual const& pred,
+                                  ProbingScheme const& probing_scheme,
+                                  Allocator const& alloc,
+-                                 cuda_stream_ref stream) noexcept
++                                 cuda_stream_ref stream)
+     : empty_slot_sentinel_{empty_slot_sentinel},
+       erased_key_sentinel_{this->extract_key(empty_slot_sentinel)},
+       predicate_{pred},
+@@ -233,7 +233,7 @@ class open_addressing_impl {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }
++  void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); }
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -599,7 +599,7 @@ class open_addressing_impl {
+    *
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept
++  [[nodiscard]] size_type size(cuda_stream_ref stream) const
+   {
+     auto counter =
+       detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
+diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
+index e17a145..3fa1d02 100644
+--- a/include/cuco/detail/static_map/static_map.inl
++++ b/include/cuco/detail/static_map/static_map.inl
+@@ -123,7 +123,7 @@ template <class Key,
+           class Allocator,
+           class Storage>
+ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
+-  cuda_stream_ref stream) noexcept
++  cuda_stream_ref stream)
+ {
+   impl_->clear(stream);
+ }
+@@ -215,7 +215,7 @@ template <class Key,
+           class Storage>
+ template <typename InputIt>
+ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+-  insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept
++  insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream)
+ {
+   return this->insert_or_assign_async(first, last, stream);
+   stream.synchronize();
+@@ -465,7 +465,7 @@ template <class Key,
+           class Storage>
+ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
+-  cuda_stream_ref stream) const noexcept
++  cuda_stream_ref stream) const
+ {
+   return impl_->size(stream);
+ }
+diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl
+index 174f9bc..582926b 100644
+--- a/include/cuco/detail/static_multiset/static_multiset.inl
++++ b/include/cuco/detail/static_multiset/static_multiset.inl
+@@ -97,7 +97,7 @@ template <class Key,
+           class Allocator,
+           class Storage>
+ void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
+-  cuda_stream_ref stream) noexcept
++  cuda_stream_ref stream)
+ {
+   impl_->clear(stream);
+ }
+@@ -183,7 +183,7 @@ template <class Key,
+           class Storage>
+ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
+-  cuda_stream_ref stream) const noexcept
++  cuda_stream_ref stream) const
+ {
+   return impl_->size(stream);
+ }
+diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl
+index 645013f..d3cece0 100644
+--- a/include/cuco/detail/static_set/static_set.inl
++++ b/include/cuco/detail/static_set/static_set.inl
+@@ -98,7 +98,7 @@ template <class Key,
+           class Allocator,
+           class Storage>
+ void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
+-  cuda_stream_ref stream) noexcept
++  cuda_stream_ref stream)
+ {
+   impl_->clear(stream);
+ }
+@@ -429,7 +429,7 @@ template <class Key,
+           class Storage>
+ static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+ static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
+-  cuda_stream_ref stream) const noexcept
++  cuda_stream_ref stream) const
+ {
+   return impl_->size(stream);
+ }
+diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl
+index 3547f4c..94b7f98 100644
+--- a/include/cuco/detail/storage/aow_storage.inl
++++ b/include/cuco/detail/storage/aow_storage.inl
+@@ -32,8 +32,8 @@
+ namespace cuco {
+ 
+ template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
+-constexpr aow_storage<T, WindowSize, Extent, Allocator>::aow_storage(
+-  Extent size, Allocator const& allocator) noexcept
++constexpr aow_storage<T, WindowSize, Extent, Allocator>::aow_storage(Extent size,
++                                                                     Allocator const& allocator)
+   : detail::aow_storage_base<T, WindowSize, Extent>{size},
+     allocator_{allocator},
+     window_deleter_{capacity(), allocator_},
+@@ -64,7 +64,7 @@ aow_storage<T, WindowSize, Extent, Allocator>::ref() const noexcept
+ 
+ template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
+ void aow_storage<T, WindowSize, Extent, Allocator>::initialize(value_type key,
+-                                                               cuda_stream_ref stream) noexcept
++                                                               cuda_stream_ref stream)
+ {
+   this->initialize_async(key, stream);
+   stream.synchronize();
+diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
+index c86e90c..95da423 100644
+--- a/include/cuco/static_map.cuh
++++ b/include/cuco/static_map.cuh
+@@ -269,7 +269,7 @@ class static_map {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream = {}) noexcept;
++  void clear(cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -387,7 +387,7 @@ class static_map {
+    * @param stream CUDA stream used for insert
+    */
+   template <typename InputIt>
+-  void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
++  void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k`
+@@ -690,7 +690,7 @@ class static_map {
+    * @param stream CUDA stream used to get the number of inserted elements
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
++  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+ 
+   /**
+    * @brief Gets the maximum number of elements the hash map can hold.
+diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh
+index 0daf103..fbcbc9c 100644
+--- a/include/cuco/static_multiset.cuh
++++ b/include/cuco/static_multiset.cuh
+@@ -235,7 +235,7 @@ class static_multiset {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream = {}) noexcept;
++  void clear(cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -339,7 +339,7 @@ class static_multiset {
+    * @param stream CUDA stream used to get the number of inserted elements
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
++  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+ 
+   /**
+    * @brief Gets the maximum number of elements the multiset can hold.
+diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh
+index a069939..3517f84 100644
+--- a/include/cuco/static_set.cuh
++++ b/include/cuco/static_set.cuh
+@@ -240,7 +240,7 @@ class static_set {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream = {}) noexcept;
++  void clear(cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -687,7 +687,7 @@ class static_set {
+    * @param stream CUDA stream used to get the number of inserted elements
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
++  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+ 
+   /**
+    * @brief Gets the maximum number of elements the hash set can hold.
diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json
new file mode 100644
index 00000000000..ae0a9a4b4f0
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/cuco_override.json
@@ -0,0 +1,14 @@
+
+{
+  "packages" : {
+    "cuco" : {
+      "patches" : [
+        {
+          "file" : "${current_json_dir}/cuco_noexcept.diff",
+          "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]",
+          "fixed_in" : ""
+        }
+      ]
+    }
+  }
+}
diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp
index 837eda0d7b5..56ac0554403 100644
--- a/cpp/include/cudf/detail/merge.hpp
+++ b/cpp/include/cudf/detail/merge.hpp
@@ -46,6 +46,7 @@ using index_vector = rmm::device_uvector<index_type>;
  *            std::vector<cudf::size_type> const& key_cols,
  *            std::vector<cudf::order> const& column_order,
  *            std::vector<cudf::null_order> const& null_precedence,
+ *            rmm::cuda_stream_view stream,
  *            rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index e8486a80afc..9cdda773dbb 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -29,7 +29,7 @@ namespace dictionary {
 namespace detail {
 /**
  * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -40,7 +40,7 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
 
 /**
  * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -51,7 +51,7 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col
 
 /**
  * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -61,7 +61,7 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction
 
 /**
  * @copydoc cudf::dictionary::set_keys(dictionary_column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -72,7 +72,7 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
 
 /**
  * @copydoc
- * cudf::dictionary::match_dictionaries(std::vector<cudf::dictionary_column_view>,mm::mr::device_memory_resource*)
+ * cudf::dictionary::match_dictionaries(std::vector<cudf::dictionary_column_view>,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 56ec62fa6e1..11f6ce2bad7 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -39,6 +39,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <utility>
 
@@ -50,6 +51,8 @@ struct ArrowSchema;
 
 struct ArrowArray;
 
+struct ArrowArrayStream;
+
 namespace cudf {
 /**
  * @addtogroup interop_dlpack
@@ -367,10 +370,11 @@ std::unique_ptr<cudf::scalar> from_arrow(
  * @param mr Device memory resource used to allocate `cudf::table`
  * @return cudf table generated from given arrow data
  */
-std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
-                                        ArrowArray const* input,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+std::unique_ptr<cudf::table> from_arrow(
+  ArrowSchema const* schema,
+  ArrowArray const* input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
@@ -385,10 +389,11 @@ std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
  * @param mr Device memory resource used to allocate `cudf::column`
  * @return cudf column generated from given arrow data
  */
-std::unique_ptr<cudf::column> from_arrow_column(ArrowSchema const* schema,
-                                                ArrowArray const* input,
-                                                rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+std::unique_ptr<cudf::column> from_arrow_column(
+  ArrowSchema const* schema,
+  ArrowArray const* input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::table` from given ArrowDeviceArray input
@@ -411,8 +416,26 @@ std::unique_ptr<cudf::column> from_arrow_column(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow_host(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Create `cudf::table` from given ArrowArrayStream input
+ *
+ * @throws std::invalid_argument if input is NULL
+ *
+ * The conversion WILL release the input ArrayArrayStream and its constituent
+ * arrays or schema since Arrow streams are not suitable for multiple reads.
+ *
+ * @param input `ArrowArrayStream` pointer to object that will produce ArrowArray data
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to perform cuda allocation
+ * @return cudf table generated from the given Arrow data
+ */
+std::unique_ptr<table> from_arrow_stream(
+  ArrowArrayStream* input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::column` from given ArrowDeviceArray input
@@ -434,8 +457,8 @@ std::unique_ptr<table> from_arrow_host(
 std::unique_ptr<column> from_arrow_host_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
@@ -515,8 +538,8 @@ using unique_table_view_t =
 unique_table_view_t from_arrow_device(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter
@@ -558,8 +581,8 @@ using unique_column_view_t =
 unique_column_view_t from_arrow_device_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp
index 0086432d003..60ee867f058 100644
--- a/cpp/include/cudf/io/text/byte_range_info.hpp
+++ b/cpp/include/cudf/io/text/byte_range_info.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,17 +24,22 @@
 namespace cudf {
 namespace io {
 namespace text {
+/**
+ * @addtogroup io_readers
+ * @{
+ * @file
+ */
 
 /**
  * @brief stores offset and size used to indicate a byte range
  */
 class byte_range_info {
  private:
-  int64_t _offset;  ///< offset in bytes
-  int64_t _size;    ///< size in bytes
+  int64_t _offset{};  ///< offset in bytes
+  int64_t _size{};    ///< size in bytes
 
  public:
-  constexpr byte_range_info() noexcept : _offset(0), _size(0) {}
+  constexpr byte_range_info() = default;
   /**
    * @brief Constructs a byte_range_info object
    *
@@ -104,6 +109,8 @@ std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_b
  */
 byte_range_info create_byte_range_info_max();
 
+/** @} */  // end of group
+
 }  // namespace text
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp
index 28204c82780..13aff4b3b8f 100644
--- a/cpp/include/cudf/io/text/data_chunk_source.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,12 @@ namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @addtogroup io_readers
+ * @{
+ * @file
+ */
+
 /**
  * @brief A contract guaranteeing stream-ordered memory access to the underlying device data.
  *
@@ -110,6 +116,8 @@ class data_chunk_source {
   [[nodiscard]] virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
 };
 
+/** @} */  // end of group
+
 }  // namespace text
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index 7abae7c754b..e29ab78ae46 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -30,6 +30,11 @@
 namespace cudf {
 namespace io {
 namespace text {
+/**
+ * @addtogroup io_readers
+ * @{
+ * @file
+ */
 
 /**
  * @brief Parsing options for multibyte_split.
@@ -79,6 +84,7 @@ struct parse_options {
  * @param source The source string
  * @param delimiter UTF-8 encoded string for which to find offsets in the source
  * @param options the parsing options to use (including byte range)
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Memory resource to use for the device memory allocation
  * @return The strings found by splitting the source by the delimiter within the relevant byte
  * range.
@@ -87,17 +93,30 @@ std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
   parse_options options             = {},
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-std::unique_ptr<cudf::column> multibyte_split(
+/**
+ * @brief Splits the source text into a strings column using a multiple byte delimiter.
+ *
+ * @deprecated Since 24.08
+ *
+ * @param source The source input data encoded in UTF-8
+ * @param delimiter UTF-8 encoded string for which to find offsets in the source
+ * @param byte_range The position and size within `source` to produce the column from
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to use for the device memory allocation
+ * @return The strings found by splitting the source by the delimiter within the relevant byte
+ * range.
+ */
+[[deprecated]] std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
   std::optional<byte_range_info> byte_range,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
-                                              std::string const& delimiter,
-                                              rmm::device_async_resource_ref mr);
+/** @} */  // end of group
 
 }  // namespace text
 }  // namespace io
diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp
index 29aa3ffe934..301e56c19b8 100644
--- a/cpp/include/cudf/merge.hpp
+++ b/cpp/include/cudf/merge.hpp
@@ -97,6 +97,7 @@ namespace cudf {
  * @param[in] column_order Sort order types of columns indexed by key_cols
  * @param[in] null_precedence Array indicating the order of nulls with respect
  * to non-nulls for the indexing columns (key_cols)
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  *
  * @returns A table containing sorted data from all input tables
@@ -106,7 +107,7 @@ std::unique_ptr<cudf::table> merge(
   std::vector<cudf::size_type> const& key_cols,
   std::vector<cudf::order> const& column_order,
   std::vector<cudf::null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                         = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr                    = rmm::mr::get_current_device_resource());
-
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index a19aa9be0c0..a714f762a19 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -122,7 +122,7 @@ std::unique_ptr<column> replace_slice(
  * If a target string is found, it is replaced by the corresponding entry in the repls column.
  * All occurrences found in each string are replaced.
  *
- * This does not use regex to match targets in the string.
+ * This does not use regex to match targets in the string. Empty string targets are ignored.
  *
  * Null string entries will return null output string entries.
  *
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index e9b81a525fc..c181ac7d402 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -252,7 +252,7 @@ using optional_dremel_view = thrust::optional<detail::dremel_device_view const>;
  *
  * @tparam has_nested_columns compile-time optimization for primitive types.
  *         This template parameter is to be used by the developer by querying
- *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+ *         `cudf::has_nested_columns(input)`. `true` compiles operator
  *         overloads for nested types, while `false` only compiles operator
  *         overloads for primitive types.
  * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1014,7 +1014,7 @@ class self_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1186,7 +1186,7 @@ class two_table_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1326,7 +1326,7 @@ struct nan_equal_physical_equality_comparator {
  *
  * @tparam has_nested_columns compile-time optimization for primitive types.
  *         This template parameter is to be used by the developer by querying
- *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+ *         `cudf::has_nested_columns(input)`. `true` compiles operator
  *         overloads for nested types, while `false` only compiles operator
  *         overloads for primitive types.
  * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1643,7 +1643,7 @@ class self_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1757,7 +1757,7 @@ class two_table_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index a71e0558dec..4a990f67ce4 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <algorithm>
 #include <vector>
@@ -32,7 +33,7 @@
  * passed by value.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @brief Base class for a table of `ColumnView`s
@@ -123,7 +124,10 @@ class table_view_base {
    * @param column_index The index of the desired column
    * @return A reference to the desired column
    */
-  [[nodiscard]] ColumnView const& column(size_type column_index) const;
+  [[nodiscard]] ColumnView const& column(size_type column_index) const
+  {
+    return _columns.at(column_index);
+  }
 
   /**
    * @brief Returns the number of columns
@@ -174,8 +178,17 @@ class table_view_base {
  * @return Whether nested columns exist in the input table
  */
 bool has_nested_columns(table_view const& table);
+
 }  // namespace detail
 
+/**
+ * @brief Determine if any nested columns exist in a given table.
+ *
+ * @param table The input table
+ * @return Whether nested columns exist in the input table
+ */
+bool has_nested_columns(table_view const& table);
+
 /**
  * @brief A set of cudf::column_view's of the same size.
  *
@@ -374,4 +387,4 @@ extern template bool is_relationally_comparable<mutable_table_view>(mutable_tabl
                                                                     mutable_table_view const& rhs);
 // @endcond
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index b66e5cab333..4be3054b3dc 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -463,10 +463,6 @@ void traverse_children::operator()<cudf::list_view>(host_span<column_view const>
  */
 void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()),
-               "Type mismatch in columns to concatenate.",
-               cudf::data_type_error);
-
   // total size of all concatenated rows
   size_t const total_row_count =
     std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) {
@@ -476,6 +472,21 @@ void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_v
                "Total number of concatenated rows exceeds the column size limit",
                std::overflow_error);
 
+  if (std::any_of(cols.begin(), cols.end(), [](column_view const& c) {
+        return c.type().id() == cudf::type_id::EMPTY;
+      })) {
+    CUDF_EXPECTS(
+      std::all_of(cols.begin(),
+                  cols.end(),
+                  [](column_view const& c) { return c.type().id() == cudf::type_id::EMPTY; }),
+      "Mismatch in columns to concatenate.",
+      cudf::data_type_error);
+    return;
+  }
+  CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()),
+               "Type mismatch in columns to concatenate.",
+               cudf::data_type_error);
+
   // traverse children
   cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream);
 }
@@ -498,6 +509,15 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_conc
     return empty_like(columns_to_concat.front());
   }
 
+  // For empty columns, we can just create an EMPTY column of the appropriate length.
+  if (columns_to_concat.front().type().id() == cudf::type_id::EMPTY) {
+    auto length = std::accumulate(
+      columns_to_concat.begin(), columns_to_concat.end(), 0, [](auto a, auto const& b) {
+        return a + b.size();
+      });
+    return std::make_unique<column>(
+      data_type(type_id::EMPTY), length, rmm::device_buffer{}, rmm::device_buffer{}, length);
+  }
   return type_dispatcher<dispatch_storage_type>(
     columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr});
 }
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index dd9e9600a87..605d813ed1e 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -39,7 +39,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
     case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32);
     case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64);
     case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS);
-    case NANOARROW_TYPE_STRING: return data_type(type_id::STRING);
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING);
     case NANOARROW_TYPE_LIST: return data_type(type_id::LIST);
     case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32);
     case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT);
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
index 002a8ec1f14..e1d289e67a3 100644
--- a/cpp/src/interop/from_arrow_device.cu
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -35,6 +35,7 @@
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
@@ -56,7 +57,7 @@ struct dispatch_from_arrow_device {
                               data_type,
                               bool,
                               rmm::cuda_stream_view,
-                              rmm::mr::device_memory_resource*)
+                              rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error);
   }
@@ -68,7 +69,7 @@ struct dispatch_from_arrow_device {
                               data_type type,
                               bool skip_mask,
                               rmm::cuda_stream_view,
-                              rmm::mr::device_memory_resource*)
+                              rmm::device_async_resource_ref mr)
   {
     size_type const num_rows   = input->length;
     size_type const offset     = input->offset;
@@ -90,7 +91,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
                             data_type type,
                             bool skip_mask,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 template <>
 dispatch_tuple_t dispatch_from_arrow_device::operator()<bool>(ArrowSchemaView* schema,
@@ -98,7 +99,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<bool>(ArrowSchemaView* s
                                                               data_type type,
                                                               bool skip_mask,
                                                               rmm::cuda_stream_view stream,
-                                                              rmm::mr::device_memory_resource* mr)
+                                                              rmm::device_async_resource_ref mr)
 {
   if (input->length == 0) {
     return std::make_tuple<column_view, owned_columns_t>(
@@ -141,8 +142,11 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
+  CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING,
+               "Large strings are not yet supported in from_arrow_device",
+               cudf::data_type_error);
   if (input->length == 0) {
     return std::make_tuple<column_view, owned_columns_t>(
       {type,
@@ -179,7 +183,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::dictionary32>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   ArrowSchemaView keys_schema_view;
   NANOARROW_THROW_NOT_OK(
@@ -235,7 +239,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::struct_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   std::vector<column_view> children;
   owned_columns_t out_owned_cols;
@@ -280,7 +284,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::list_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   size_type const num_rows   = input->length;
   size_type const offset     = input->offset;
@@ -321,7 +325,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
                             data_type type,
                             bool skip_mask,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   return type.id() != type_id::EMPTY
            ? std::move(type_dispatcher(
@@ -339,7 +343,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
 unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       ArrowDeviceArray const* input,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -394,7 +398,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema,
 unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               ArrowDeviceArray const* input,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -426,7 +430,7 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
 unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       ArrowDeviceArray const* input,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -436,7 +440,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema,
 unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               ArrowDeviceArray const* input,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
index 854a1d68fdc..b3087dedf98 100644
--- a/cpp/src/interop/from_arrow_host.cu
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -38,6 +38,7 @@
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
@@ -49,7 +50,7 @@ namespace {
 
 struct dispatch_copy_from_arrow_host {
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
   std::unique_ptr<rmm::device_buffer> get_mask_buffer(ArrowArray const* array)
   {
@@ -131,7 +132,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
                                         data_type type,
                                         bool skip_mask,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 template <>
 std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSchemaView* schema,
@@ -188,8 +189,16 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_v
 
   // chars_column does not contain any nulls, they are tracked by the parent string column
   // itself instead. So we pass nullptr for the validity bitmask.
-  size_type const char_data_length =
-    reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset];
+  int64_t const char_data_length = [&]() {
+    if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
+      return reinterpret_cast<int64_t const*>(offset_buffers[1])[input->length + input->offset];
+    } else if (schema->type == NANOARROW_TYPE_STRING) {
+      return static_cast<int64_t>(
+        reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset]);
+    } else {
+      CUDF_FAIL("Unsupported string type", cudf::data_type_error);
+    }
+  }();
   void const* char_buffers[2] = {nullptr, input->buffers[2]};
   ArrowArray char_array       = {
           .length     = char_data_length,
@@ -210,15 +219,27 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_v
   // offset and char data columns for us.
   ArrowSchemaView view;
   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
-  auto offsets_column =
-    this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+  auto offsets_column = [&]() {
+    if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
+      return this->operator()<int64_t>(&view, &offsets_array, data_type(type_id::INT64), true);
+    } else if (schema->type == NANOARROW_TYPE_STRING) {
+      return this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+    } else {
+      CUDF_FAIL("Unsupported string type", cudf::data_type_error);
+    }
+  }();
   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr));
-  auto chars_column = this->operator()<int8_t>(&view, &char_array, data_type(type_id::INT8), true);
 
+  rmm::device_buffer chars(char_data_length, stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
+                                reinterpret_cast<uint8_t const*>(char_array.buffers[1]),
+                                chars.size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
   auto const num_rows = offsets_column->size() - 1;
   auto out_col        = make_strings_column(num_rows,
                                      std::move(offsets_column),
-                                     std::move(chars_column->release().data.release()[0]),
+                                     std::move(chars),
                                      input->null_count,
                                      std::move(*get_mask_buffer(input)));
 
@@ -368,7 +389,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
                                         data_type type,
                                         bool skip_mask,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return type.id() != type_id::EMPTY
            ? std::move(type_dispatcher(
@@ -385,7 +406,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
 std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
                                        ArrowDeviceArray const* input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -421,7 +442,7 @@ std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
                                                ArrowDeviceArray const* input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -442,7 +463,7 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
                                        ArrowDeviceArray const* input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -452,7 +473,7 @@ std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
                                                ArrowDeviceArray const* input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -462,7 +483,7 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
                                   ArrowArray const* input,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -477,7 +498,7 @@ std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_column(ArrowSchema const* schema,
                                           ArrowArray const* input,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu
new file mode 100644
index 00000000000..578105aa90a
--- /dev/null
+++ b/cpp/src/interop/from_arrow_stream.cu
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arrow_utilities.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/concatenate.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/table/table.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow.hpp>
+
+#include <memory>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace cudf {
+namespace detail {
+
+namespace {
+
+std::unique_ptr<column> make_empty_column_from_schema(ArrowSchema const* schema,
+                                                      rmm::cuda_stream_view stream,
+                                                      rmm::device_async_resource_ref mr)
+{
+  ArrowSchemaView schema_view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr));
+
+  auto const type{arrow_to_cudf_type(&schema_view)};
+  switch (type.id()) {
+    case type_id::EMPTY: {
+      return std::make_unique<column>(
+        data_type(type_id::EMPTY), 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
+    }
+    case type_id::LIST: {
+      return cudf::make_lists_column(0,
+                                     cudf::make_empty_column(data_type{type_id::INT32}),
+                                     make_empty_column_from_schema(schema->children[0], stream, mr),
+                                     0,
+                                     {},
+                                     stream,
+                                     mr);
+    }
+    case type_id::STRUCT: {
+      std::vector<std::unique_ptr<column>> child_columns;
+      child_columns.reserve(schema->n_children);
+      std::transform(
+        schema->children,
+        schema->children + schema->n_children,
+        std::back_inserter(child_columns),
+        [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); });
+      return cudf::make_structs_column(0, std::move(child_columns), 0, {}, stream, mr);
+    }
+    default: {
+      return cudf::make_empty_column(type);
+    }
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument);
+
+  // Potential future optimization: Since the from_arrow API accepts an
+  // ArrowSchema we're allocating one here instead of using a view, which we
+  // could avoid with a different underlying implementation.
+  ArrowSchema schema;
+  NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetSchema(input, &schema, nullptr));
+
+  std::vector<std::unique_ptr<cudf::table>> chunks;
+  ArrowArray chunk;
+  while (true) {
+    NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetNext(input, &chunk, nullptr));
+    if (chunk.release == nullptr) { break; }
+    chunks.push_back(from_arrow(&schema, &chunk, stream, mr));
+    chunk.release(&chunk);
+  }
+  input->release(input);
+
+  if (chunks.empty()) {
+    if (schema.n_children == 0) {
+      schema.release(&schema);
+      return std::make_unique<cudf::table>();
+    }
+
+    // If there are no chunks but the schema has children, we need to construct a suitable empty
+    // table.
+    std::vector<std::unique_ptr<cudf::column>> columns;
+    columns.reserve(chunks.size());
+    std::transform(
+      schema.children,
+      schema.children + schema.n_children,
+      std::back_inserter(columns),
+      [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); });
+    schema.release(&schema);
+    return std::make_unique<cudf::table>(std::move(columns));
+  }
+
+  schema.release(&schema);
+
+  auto chunk_views = std::vector<table_view>{};
+  chunk_views.reserve(chunks.size());
+  std::transform(
+    chunks.begin(), chunks.end(), std::back_inserter(chunk_views), [](auto const& chunk) {
+      return chunk->view();
+    });
+  return cudf::detail::concatenate(chunk_views, stream, mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::from_arrow_stream(input, stream, mr);
+}
+}  // namespace cudf
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 7c4d5711281..63eb0b03c5f 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -25,6 +25,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/fill.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/detail/csv.hpp>
@@ -372,15 +373,33 @@ void write_chunked(data_sink* out_sink,
   CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");
 
   cudf::string_scalar newline{options.get_line_terminator(), true, stream};
-  auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view,
-                                                            newline,
-                                                            string_scalar{"", false, stream},
-                                                            stream,
-                                                            rmm::mr::get_current_device_resource());
-  strings_column_view strings_column{p_str_col_w_nl->view()};
 
-  auto total_num_bytes      = strings_column.chars_size(stream);
-  char const* ptr_all_bytes = strings_column.chars_begin(stream);
+  // use strings concatenate to build the final CSV output in device memory
+  auto contents_w_nl = [&] {
+    auto const total_size =
+      str_column_view.chars_size(stream) + (newline.size() * str_column_view.size());
+    auto const empty_str = string_scalar("", true, stream);
+    // use join_strings when the output will be less than 2GB
+    if (total_size < static_cast<int64_t>(std::numeric_limits<size_type>::max())) {
+      return cudf::strings::detail::join_strings(str_column_view, newline, empty_str, stream, mr)
+        ->release();
+    }
+    auto nl_col = cudf::make_column_from_scalar(newline, str_column_view.size(), stream);
+    // convert the last element into an empty string by resetting the last offset value
+    auto& offsets     = nl_col->child(strings_column_view::offsets_column_index);
+    auto offsets_view = offsets.mutable_view();
+    cudf::fill_in_place(offsets_view,
+                        offsets.size() - 1,  // set the last element with
+                        offsets.size(),      // the value from 2nd to last element
+                        *cudf::detail::get_element(offsets.view(), offsets.size() - 2, stream, mr),
+                        stream);
+    auto const nl_tbl = cudf::table_view({str_column_view.parent(), nl_col->view()});
+    return cudf::strings::detail::concatenate(
+             nl_tbl, empty_str, empty_str, strings::separator_on_nulls::NO, stream, mr)
+      ->release();
+  }();
+  auto const total_num_bytes = contents_w_nl.data->size();
+  auto const ptr_all_bytes   = static_cast<char const*>(contents_w_nl.data->data());
 
   if (out_sink->is_device_write_preferred(total_num_bytes)) {
     // Direct write from device memory
@@ -491,7 +510,8 @@ void write_csv(data_sink* out_sink,
           str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource());
       }();
 
-      write_chunked(out_sink, str_concat_col->view(), options, stream, mr);
+      write_chunked(
+        out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource());
     }
   }
 }
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 405084cc4ad..1cd45fc897c 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -2242,131 +2242,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
   return {};
 }
 
-table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,
-                                           cudf::io::json_reader_options const& options,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr)
-{
-  // Range of orchestrating/encapsulating function
-  CUDF_FUNC_RANGE();
-
-  auto const h_input = cudf::detail::make_std_vector_async(d_input, stream);
-
-  auto const new_line_delimited_json = options.is_enabled_lines();
-
-  // Get internal JSON column
-  json_column root_column{};
-  std::stack<tree_node> data_path{};
-
-  constexpr uint32_t row_offset_zero            = 0;
-  constexpr uint32_t token_begin_offset_zero    = 0;
-  constexpr uint32_t token_end_offset_zero      = 0;
-  constexpr uint32_t node_init_child_count_zero = 0;
-
-  // Whether the tokenizer stage should keep quote characters for string values
-  // If the tokenizer keeps the quote characters, they may be stripped during type casting
-  constexpr bool include_quote_chars = true;
-
-  // We initialize the very root node and root column, which represent the JSON document being
-  // parsed. That root node is a list node and that root column is a list column. The column has the
-  // root node as its only row. The values parsed from the JSON input will be treated as follows:
-  // (1) For JSON lines: we expect to find a list of JSON values that all
-  // will be inserted into this root list column. (2) For regular JSON: we expect to have only a
-  // single value (list, struct, string, number, literal) that will be inserted into this root
-  // column.
-  root_column.append_row(
-    row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1);
-
-  // Push the root node onto the stack for the data path
-  data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero});
-
-  make_json_column(
-    root_column, data_path, h_input, d_input, options, include_quote_chars, stream, mr);
-
-  // data_root refers to the root column of the data represented by the given JSON string
-  auto const& data_root =
-    new_line_delimited_json ? root_column : root_column.child_columns.begin()->second;
-
-  // Zero row entries
-  if (data_root.type == json_col_t::ListColumn && data_root.child_columns.empty()) {
-    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
-  }
-
-  // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects)
-  auto constexpr single_child_col_count = 1;
-  CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and
-                 data_root.child_columns.size() == single_child_col_count and
-                 data_root.child_columns.begin()->second.type == json_col_t::StructColumn,
-               "Currently the nested JSON parser only supports an array of (nested) objects");
-
-  // Slice off the root list column, which has only a single row that contains all the structs
-  auto const& root_struct_col = data_root.child_columns.begin()->second;
-
-  // Initialize meta data to be populated while recursing through the tree of columns
-  std::vector<std::unique_ptr<column>> out_columns;
-  std::vector<column_name_info> out_column_names;
-
-  // Iterate over the struct's child columns and convert to cudf column
-  size_type column_index = 0;
-  for (auto const& col_name : root_struct_col.column_order) {
-    auto const& json_col = root_struct_col.child_columns.find(col_name)->second;
-    // Insert this columns name into the schema
-    out_column_names.emplace_back(col_name);
-
-    std::optional<schema_element> child_schema_element = std::visit(
-      cudf::detail::visitor_overload{
-        [column_index](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
-          auto ret = (static_cast<std::size_t>(column_index) < user_dtypes.size())
-                       ? std::optional<schema_element>{{user_dtypes[column_index]}}
-                       : std::optional<schema_element>{};
-#ifdef NJP_DEBUG_PRINT
-          std::cout << "Column by index: #" << column_index << ", type id: "
-                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
-                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
-                    << "\n";
-#endif
-          return ret;
-        },
-        [col_name](
-          std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
-          auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes))
-                       ? std::optional<schema_element>{{user_dtypes.find(col_name)->second}}
-                       : std::optional<schema_element>{};
-#ifdef NJP_DEBUG_PRINT
-          std::cout << "Column by flat name: '" << col_name << "', type id: "
-                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
-                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
-                    << "\n";
-#endif
-          return ret;
-        },
-        [col_name](std::map<std::string, schema_element> const& user_dtypes)
-          -> std::optional<schema_element> {
-          auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes))
-                       ? user_dtypes.find(col_name)->second
-                       : std::optional<schema_element>{};
-#ifdef NJP_DEBUG_PRINT
-          std::cout << "Column by nested name: #" << col_name << ", type id: "
-                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
-                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
-                    << "\n";
-#endif
-          return ret;
-        }},
-      options.get_dtypes());
-
-    // Get this JSON column's cudf column and schema info
-    auto [cudf_col, col_name_info] =
-      json_column_to_cudf_column(json_col, d_input, options, child_schema_element, stream, mr);
-    out_column_names.back().children = std::move(col_name_info);
-    out_columns.emplace_back(std::move(cudf_col));
-
-    column_index++;
-  }
-
-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
-}
-
 }  // namespace detail
 }  // namespace cudf::io::json
 
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 9c406369068..51dc0ca90af 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -565,35 +565,32 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
 
 }  // namespace detail
 
+// deprecated in 24.08
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               std::optional<byte_range_info> byte_range,
+                                              rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
 {
-  return multibyte_split(
-    source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
+  return multibyte_split(source,
+                         delimiter,
+                         parse_options{byte_range.value_or(create_byte_range_info_max())},
+                         stream,
+                         mr);
 }
 
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               parse_options options,
+                                              rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
 {
-  auto stream = cudf::get_default_stream();
-
   auto result = detail::multibyte_split(
     source, delimiter, options.byte_range, options.strip_delimiters, stream, mr);
 
   return result;
 }
 
-std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
-                                              std::string const& delimiter,
-                                              rmm::device_async_resource_ref mr)
-{
-  return multibyte_split(source, delimiter, parse_options{}, mr);
-}
-
 }  // namespace text
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 42e0e4f45ee..90748e6f322 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -82,9 +82,7 @@ mixed_join(
       // Left and full joins all return all the row indices from
       // left with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
-      case join_kind::FULL_JOIN:
-        return get_trivial_left_join_indices(
-          left_conditional, stream, rmm::mr::get_current_device_resource());
+      case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream, mr);
       // Inner joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
         return std::pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
@@ -100,8 +98,7 @@ mixed_join(
                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       // Full joins need to return the trivial complement.
       case join_kind::FULL_JOIN: {
-        auto ret_flipped = get_trivial_left_join_indices(
-          right_conditional, stream, rmm::mr::get_current_device_resource());
+        auto ret_flipped = get_trivial_left_join_indices(right_conditional, stream, mr);
         return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first));
       }
       default: CUDF_FAIL("Invalid join kind."); break;
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index 8500b248fcf..c147ea3c253 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -117,9 +117,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
       // Anti and semi return all the row indices from left
       // with a corresponding NULL from the right.
       case join_kind::LEFT_ANTI_JOIN:
-        return get_trivial_left_join_indices(
-                 left_conditional, stream, rmm::mr::get_current_device_resource())
-          .first;
+        return get_trivial_left_join_indices(left_conditional, stream, mr).first;
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::LEFT_SEMI_JOIN:
         return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 630cf328579..7ecaa0fba56 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -694,11 +694,11 @@ std::unique_ptr<cudf::table> merge(std::vector<table_view> const& tables_to_merg
                                    std::vector<cudf::size_type> const& key_cols,
                                    std::vector<cudf::order> const& column_order,
                                    std::vector<cudf::null_order> const& null_precedence,
+                                   rmm::cuda_stream_view stream,
                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::merge(
-    tables_to_merge, key_cols, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::merge(tables_to_merge, key_cols, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 43a3d69091a..2ca22f0e017 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -451,8 +451,8 @@ struct replace_multi_fn {
     while (spos < d_str.size_bytes()) {
       for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) {
         auto const d_tgt = d_targets.element<string_view>(tgt_idx);
-        if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) &&    // check fit
-            (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0))  // and match
+        if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) &&  // check fit
+            (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0))                  // and match
         {
           auto const d_repl = (d_repls.size() == 1) ? d_repls.element<string_view>(0)
                                                     : d_repls.element<string_view>(tgt_idx);
@@ -468,9 +468,8 @@ struct replace_multi_fn {
       }
       ++spos;
     }
-    if (out_ptr)  // copy remainder
-    {
-      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
+    if (out_ptr) {
+      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);  // copy remainder
     } else {
       d_sizes[idx] = bytes;
     }
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 101004a5d06..f70598f33be 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -158,8 +158,13 @@ int64_t get_offset64_threshold()
 
 bool is_large_strings_enabled()
 {
+  // default depends on compile-time switch but can be overridden by the environment variable
   auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED");
+#ifdef CUDF_LARGE_STRINGS_DISABLED
   return env != nullptr && std::string(env) == "1";
+#else
+  return env == nullptr || std::string(env) == "1";
+#endif
 }
 
 int64_t get_offset_value(cudf::column_view const& offsets,
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index 13832b0d9dc..8a5340dc20d 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -52,12 +52,6 @@ auto concatenate_column_views(std::vector<ViewType> const& views)
   return concat_cols;
 }
 
-template <typename ColumnView>
-ColumnView const& table_view_base<ColumnView>::column(size_type column_index) const
-{
-  return _columns.at(column_index);
-}
-
 // Explicit instantiation for a table of `column_view`s
 template class table_view_base<column_view>;
 
@@ -172,6 +166,7 @@ bool has_nested_columns(table_view const& table)
   return std::any_of(
     table.begin(), table.end(), [](column_view const& col) { return is_nested(col.type()); });
 }
-
 }  // namespace detail
+
+bool has_nested_columns(table_view const& table) { return detail::has_nested_columns(table); }
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eef09954647..8e2017ccb97 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -273,6 +273,7 @@ ConfigureTest(
   interop/from_arrow_test.cpp
   interop/from_arrow_device_test.cpp
   interop/from_arrow_host_test.cpp
+  interop/from_arrow_stream_test.cpp
   interop/dlpack_test.cpp
   EXTRA_LIB
   nanoarrow
@@ -691,6 +692,8 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_MERGE_TEST streams/merge_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 054441788d0..18140c34abd 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -1667,3 +1667,63 @@ TEST_F(DictionaryConcatTest, ErrorsTest)
   std::vector<cudf::column_view> empty;
   EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error);
 }
+
+struct EmptyColumnTest : public cudf::test::BaseFixture {};
+
+TEST_F(EmptyColumnTest, SimpleTest)
+{
+  std::vector<cudf::column> columns;
+  constexpr auto num_copies = 10;
+  constexpr auto num_rows   = 10;
+  for (auto i = 0; i < num_copies; ++i) {
+    columns.emplace_back(cudf::data_type(cudf::type_id::EMPTY),
+                         num_rows,
+                         rmm::device_buffer{},
+                         rmm::device_buffer{},
+                         0);
+  }
+
+  // Create views from columns
+  std::vector<cudf::column_view> views;
+  for (auto& col : columns) {
+    views.push_back(col.view());
+  }
+  auto result = cudf::concatenate(views);
+
+  ASSERT_EQ(result->size(), num_copies * num_rows);
+  ASSERT_EQ(result->type().id(), cudf::type_id::EMPTY);
+}
+
+struct TableOfEmptyColumnsTest : public cudf::test::BaseFixture {};
+
+TEST_F(TableOfEmptyColumnsTest, SimpleTest)
+{
+  std::vector<cudf::table> tables;
+  constexpr auto num_copies  = 10;
+  constexpr auto num_rows    = 10;
+  constexpr auto num_columns = 10;
+  for (auto i = 0; i < num_copies; ++i) {
+    std::vector<std::unique_ptr<cudf::column>> columns;
+    for (auto j = 0; j < num_columns; ++j) {
+      columns.push_back(std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::EMPTY),
+                                                       num_rows,
+                                                       rmm::device_buffer{},
+                                                       rmm::device_buffer{},
+                                                       0));
+    }
+    tables.emplace_back(std::move(columns));
+  }
+
+  // Create views from columns
+  std::vector<cudf::table_view> views;
+  for (auto& tbl : tables) {
+    views.push_back(tbl.view());
+  }
+  auto result = cudf::concatenate(views);
+
+  ASSERT_EQ(result->num_rows(), num_copies * num_rows);
+  ASSERT_EQ(result->num_columns(), num_columns);
+  for (auto i = 0; i < num_columns; ++i) {
+    ASSERT_EQ(result->get_column(i).type().id(), cudf::type_id::EMPTY);
+  }
+}
diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp
new file mode 100644
index 00000000000..80a2e4b2ffd
--- /dev/null
+++ b/cpp/tests/interop/from_arrow_stream_test.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nanoarrow_utils.hpp"
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/dictionary/encode.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/type_checks.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+struct VectorOfArrays {
+  std::vector<nanoarrow::UniqueArray> arrays;
+  nanoarrow::UniqueSchema schema;
+  size_t index{0};
+
+  static int get_schema(ArrowArrayStream* stream, ArrowSchema* out_schema)
+  {
+    auto private_data = static_cast<VectorOfArrays*>(stream->private_data);
+
+    [[maybe_unused]] auto rc = ArrowSchemaDeepCopy(private_data->schema.get(), out_schema);
+    return 0;
+  }
+
+  static int get_next(ArrowArrayStream* stream, ArrowArray* out_array)
+  {
+    auto private_data = static_cast<VectorOfArrays*>(stream->private_data);
+    if (private_data->index >= private_data->arrays.size()) {
+      out_array->release = nullptr;
+      return 0;
+    }
+    ArrowArrayMove(private_data->arrays[private_data->index++].get(), out_array);
+    return 0;
+  }
+
+  static const char* get_last_error(ArrowArrayStream* stream) { return nullptr; }
+
+  static void release(ArrowArrayStream* stream)
+  {
+    delete static_cast<VectorOfArrays*>(stream->private_data);
+  }
+};
+
+struct FromArrowStreamTest : public cudf::test::BaseFixture {};
+
+void makeStreamFromArrays(std::vector<nanoarrow::UniqueArray> arrays,
+                          nanoarrow::UniqueSchema schema,
+                          ArrowArrayStream* out)
+{
+  auto* private_data  = new VectorOfArrays{std::move(arrays), std::move(schema)};
+  out->get_schema     = VectorOfArrays::get_schema;
+  out->get_next       = VectorOfArrays::get_next;
+  out->get_last_error = VectorOfArrays::get_last_error;
+  out->release        = VectorOfArrays::release;
+  out->private_data   = private_data;
+}
+
+TEST_F(FromArrowStreamTest, BasicTest)
+{
+  constexpr auto num_copies = 3;
+  std::vector<std::unique_ptr<cudf::table>> tables;
+  // The schema is unique across all tables.
+  nanoarrow::UniqueSchema schema;
+  std::vector<nanoarrow::UniqueArray> arrays;
+  for (auto i = 0; i < num_copies; ++i) {
+    auto [tbl, sch, arr] = get_nanoarrow_host_tables(0);
+    tables.push_back(std::move(tbl));
+    arrays.push_back(std::move(arr));
+    if (i == 0) { sch.move(schema.get()); }
+  }
+  std::vector<cudf::table_view> table_views;
+  for (auto const& table : tables) {
+    table_views.push_back(table->view());
+  }
+  auto expected = cudf::concatenate(table_views);
+
+  ArrowArrayStream stream;
+  makeStreamFromArrays(std::move(arrays), std::move(schema), &stream);
+  auto result = cudf::from_arrow_stream(&stream);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view());
+}
+
+TEST_F(FromArrowStreamTest, EmptyTest)
+{
+  auto [tbl, sch, arr] = get_nanoarrow_host_tables(0);
+  std::vector<cudf::table_view> table_views{tbl->view()};
+  auto expected = cudf::concatenate(table_views);
+
+  ArrowArrayStream stream;
+  makeStreamFromArrays({}, std::move(sch), &stream);
+  auto result = cudf::from_arrow_stream(&stream);
+  cudf::have_same_types(expected->view(), result->view());
+}
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index 94c4372e74a..4147728b2a6 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -375,3 +375,6 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list<T> data,
 
 std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, generated_test_data>
 get_nanoarrow_cudf_table(cudf::size_type length);
+
+std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
+get_nanoarrow_host_tables(cudf::size_type length);
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 36338253c9b..408d54bd5ff 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -97,10 +97,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
-  auto out    = cudf::io::text::multibyte_split(
-    *source,
-    delimiter,
-    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())});
+  cudf::io::text::parse_options options{
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())}};
+  auto out = cudf::io::text::multibyte_split(*source, delimiter, options);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
@@ -113,10 +112,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2)
   auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
-  auto out    = cudf::io::text::multibyte_split(
-    *source,
-    delimiter,
-    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)});
+  cudf::io::text::parse_options options{
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)}};
+  auto out = cudf::io::text::multibyte_split(*source, delimiter, options);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
@@ -277,9 +275,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange)
   auto source    = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -303,9 +304,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange)
   auto source                           = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -327,9 +331,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte)
   auto source    = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -352,9 +359,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRangeSingleByte)
   auto source                       = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -383,9 +393,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges)
     SCOPED_TRACE(split1);
     for (int split2 = split1 + 1; split2 < size; split2++) {
       SCOPED_TRACE(split2);
-      auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
-      auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
-      auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});
+      auto out1 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}});
+      auto out2 =
+        multibyte_split(*source,
+                        delimiter,
+                        cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}});
+      auto out3 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}});
 
       auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
       auto out       = cudf::concatenate(out_views);
@@ -416,9 +431,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte)
     SCOPED_TRACE(split1);
     for (int split2 = split1 + 1; split2 < size; split2++) {
       SCOPED_TRACE(split2);
-      auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
-      auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
-      auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});
+      auto out1 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}});
+      auto out2 =
+        multibyte_split(*source,
+                        delimiter,
+                        cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}});
+      auto out3 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}});
 
       auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
       auto out       = cudf::concatenate(out_views);
@@ -441,7 +461,8 @@ TEST_F(MultibyteSplitTest, SingletonRangeAtEnd)
   auto source     = make_source(host_input);
   auto expected   = strings_column_wrapper{};
 
-  auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1});
+  auto out =
+    multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{5, 1}});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
 }
@@ -480,7 +501,8 @@ TEST_F(MultibyteSplitTest, EmptyRange)
   auto source     = make_source(host_input);
   auto expected   = strings_column_wrapper{};
 
-  auto out = multibyte_split(*source, delimiter, byte_range_info{4, 0});
+  auto out =
+    multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{4, 0}});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
 }
@@ -493,7 +515,8 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte)
   auto source     = make_source(host_input);
   auto expected   = strings_column_wrapper{};
 
-  auto out = multibyte_split(*source, delimiter, byte_range_info{3, 0});
+  auto out =
+    multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{3, 0}});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
 }
diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp
new file mode 100644
index 00000000000..b0eff1d3340
--- /dev/null
+++ b/cpp/tests/streams/io/multibyte_split_test.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/io/text/byte_range_info.hpp>
+#include <cudf/io/text/data_chunk_source_factories.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
+
+#include <string>
+
+class MultibyteSplitTest : public cudf::test::BaseFixture {};
+
+TEST_F(MultibyteSplitTest, Reader)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abc:def");
+  auto source     = cudf::io::text::make_source(host_input);
+  cudf::io::text::parse_options options{};
+  auto result =
+    cudf::io::text::multibyte_split(*source, delimiter, options, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/merge_test.cpp b/cpp/tests/streams/merge_test.cpp
new file mode 100644
index 00000000000..1dfe877878d
--- /dev/null
+++ b/cpp/tests/streams/merge_test.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/merge.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+
+#include <vector>
+
+template <typename T>
+class MergeTest_ : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(MergeTest_, cudf::test::FixedWidthTypes);
+
+TYPED_TEST(MergeTest_, MergeIsZeroWhenShouldNotBeZero)
+{
+  using columnFactoryT = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  columnFactoryT leftColWrap1({1, 2, 3, 4, 5});
+  cudf::test::fixed_width_column_wrapper<TypeParam> rightColWrap1{};
+
+  std::vector<cudf::size_type> key_cols{0};
+  std::vector<cudf::order> column_order;
+  column_order.push_back(cudf::order::ASCENDING);
+  std::vector<cudf::null_order> null_precedence(column_order.size(), cudf::null_order::AFTER);
+
+  cudf::table_view left_view{{leftColWrap1}};
+  cudf::table_view right_view{{rightColWrap1}};
+  cudf::table_view expected{{leftColWrap1}};
+
+  auto result = cudf::merge({left_view, right_view},
+                            key_cols,
+                            column_order,
+                            null_precedence,
+                            cudf::test::get_default_stream());
+
+  int expected_len = 5;
+  ASSERT_EQ(result->num_rows(), expected_len);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(MergeTest_, SingleTableInput)
+{
+  cudf::size_type inputRows = 40;
+
+  auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  cudf::test::fixed_width_column_wrapper<TypeParam, typename decltype(sequence)::value_type>
+    colWrap1(sequence, sequence + inputRows);
+
+  std::vector<cudf::size_type> key_cols{0};
+  std::vector<cudf::order> column_order{cudf::order::ASCENDING};
+  std::vector<cudf::null_order> null_precedence{};
+
+  cudf::table_view left_view{{colWrap1}};
+
+  std::unique_ptr<cudf::table> p_outputTable;
+  CUDF_EXPECT_NO_THROW(
+    p_outputTable = cudf::merge(
+      {left_view}, key_cols, column_order, null_precedence, cudf::test::get_default_stream()));
+
+  auto input_column_view{left_view.column(0)};
+  auto output_column_view{p_outputTable->view().column(0)};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(input_column_view, output_column_view);
+}
+
+class MergeTest : public cudf::test::BaseFixture {};
+
+TEST_F(MergeTest, KeysWithNulls)
+{
+  cudf::size_type nrows = 13200;  // Ensures that thrust::merge uses more than one tile/block
+  auto data_iter        = thrust::make_counting_iterator<int32_t>(0);
+  auto valids1 =
+    cudf::detail::make_counting_transform_iterator(0, [](auto row) { return row % 10 != 0; });
+  cudf::test::fixed_width_column_wrapper<int32_t> data1(data_iter, data_iter + nrows, valids1);
+  auto valids2 =
+    cudf::detail::make_counting_transform_iterator(0, [](auto row) { return row % 15 != 0; });
+  cudf::test::fixed_width_column_wrapper<int32_t> data2(data_iter, data_iter + nrows, valids2);
+  auto all_data = cudf::concatenate(std::vector<cudf::column_view>{{data1, data2}},
+                                    cudf::test::get_default_stream());
+
+  std::vector<cudf::order> column_orders{cudf::order::ASCENDING, cudf::order::DESCENDING};
+  std::vector<cudf::null_order> null_precedences{cudf::null_order::AFTER, cudf::null_order::BEFORE};
+
+  for (auto co : column_orders)
+    for (auto np : null_precedences) {
+      std::vector<cudf::order> column_order{co};
+      std::vector<cudf::null_order> null_precedence{np};
+      auto sorted1 = cudf::sort(cudf::table_view({data1}),
+                                column_order,
+                                null_precedence,
+                                cudf::test::get_default_stream())
+                       ->release();
+      auto col1    = sorted1.front()->view();
+      auto sorted2 = cudf::sort(cudf::table_view({data2}),
+                                column_order,
+                                null_precedence,
+                                cudf::test::get_default_stream())
+                       ->release();
+      auto col2 = sorted2.front()->view();
+
+      auto result     = cudf::merge({cudf::table_view({col1}), cudf::table_view({col2})},
+                                    {0},
+                                column_order,
+                                null_precedence,
+                                cudf::test::get_default_stream());
+      auto sorted_all = cudf::sort(cudf::table_view({all_data->view()}),
+                                   column_order,
+                                   null_precedence,
+                                   cudf::test::get_default_stream());
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_all->view().column(0), result->view().column(0));
+    }
+}
+
+CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index 3aa7467d156..6c4afbb435a 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
   }
 }
 
+TEST_F(StringsReplaceTest, EmptyTarget)
+{
+  auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"});
+  auto const sv    = cudf::strings_column_view(input);
+
+  auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"});
+  auto const tv      = cudf::strings_column_view(targets);
+
+  auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"});
+  auto const rv    = cudf::strings_column_view(repls);
+
+  // empty target should be ignored
+  auto results  = cudf::strings::replace_multiple(sv, tv, rv);
+  auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+}
+
 TEST_F(StringsReplaceTest, EmptyStringsColumn)
 {
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
diff --git a/cpp/tests/table/experimental_row_operator_tests.cu b/cpp/tests/table/experimental_row_operator_tests.cu
index 896cc7a82d4..0d9e4e27f2c 100644
--- a/cpp/tests/table/experimental_row_operator_tests.cu
+++ b/cpp/tests/table/experimental_row_operator_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -109,15 +109,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTables)
   auto const lhs       = cudf::table_view{{col1}};
   auto const empty_rhs = cudf::table_view{{col2}};
 
-  auto const stream    = cudf::get_default_stream();
-  auto const test_sort = [stream](auto const& preprocessed,
-                                  auto const& input,
-                                  auto const& comparator,
-                                  auto const& expected) {
-    auto const order = sorted_order(
-      preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
-  };
+  auto const stream = cudf::get_default_stream();
+  auto const test_sort =
+    [stream](
+      auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) {
+      auto const order = sorted_order(
+        preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream);
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
+    };
 
   auto const test_sort_two_tables = [&](auto const& preprocessed_lhs,
                                         auto const& preprocessed_empty_rhs) {
@@ -188,15 +187,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTablesWithListsOfStructs)
   auto const lhs          = cudf::table_view{{*col1}};
   auto const empty_rhs    = cudf::table_view{{*col2}};
 
-  auto const stream    = cudf::get_default_stream();
-  auto const test_sort = [stream](auto const& preprocessed,
-                                  auto const& input,
-                                  auto const& comparator,
-                                  auto const& expected) {
-    auto const order = sorted_order(
-      preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
-  };
+  auto const stream = cudf::get_default_stream();
+  auto const test_sort =
+    [stream](
+      auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) {
+      auto const order = sorted_order(
+        preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream);
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
+    };
 
   auto const test_sort_two_tables = [&](auto const& preprocessed_lhs,
                                         auto const& preprocessed_empty_rhs) {
diff --git a/cpp/tests/table/row_operator_tests_utilities.cu b/cpp/tests/table/row_operator_tests_utilities.cu
index cfffa1cdd54..6127864987d 100644
--- a/cpp/tests/table/row_operator_tests_utilities.cu
+++ b/cpp/tests/table/row_operator_tests_utilities.cu
@@ -42,7 +42,7 @@ std::unique_ptr<cudf::column> two_table_comparison(cudf::table_view lhs,
   auto output = cudf::make_numeric_column(
     cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED);
 
-  if (cudf::detail::has_nested_columns(lhs) || cudf::detail::has_nested_columns(rhs)) {
+  if (cudf::has_nested_columns(lhs) || cudf::has_nested_columns(rhs)) {
     thrust::transform(rmm::exec_policy(stream),
                       lhs_it,
                       lhs_it + lhs.num_rows(),
@@ -129,7 +129,7 @@ std::unique_ptr<cudf::column> two_table_equality(cudf::table_view lhs,
   auto output = cudf::make_numeric_column(
     cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED);
 
-  if (cudf::detail::has_nested_columns(lhs) or cudf::detail::has_nested_columns(rhs)) {
+  if (cudf::has_nested_columns(lhs) or cudf::has_nested_columns(rhs)) {
     auto const equal_comparator =
       table_comparator.equal_to<true>(cudf::nullate::NO{}, cudf::null_equality::EQUAL, comparator);
 
diff --git a/cpp/tests/table/row_operator_tests_utilities2.cu b/cpp/tests/table/row_operator_tests_utilities2.cu
index 057d9ee1004..17d274eba13 100644
--- a/cpp/tests/table/row_operator_tests_utilities2.cu
+++ b/cpp/tests/table/row_operator_tests_utilities2.cu
@@ -41,7 +41,7 @@ std::unique_ptr<cudf::column> self_comparison(cudf::table_view input,
   auto output = cudf::make_numeric_column(
     cudf::data_type(cudf::type_id::BOOL8), input.num_rows(), cudf::mask_state::UNALLOCATED);
 
-  if (cudf::detail::has_nested_columns(input)) {
+  if (cudf::has_nested_columns(input)) {
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator(0),
                       thrust::make_counting_iterator(input.num_rows()),
diff --git a/dependencies.yaml b/dependencies.yaml
index 38ec30a8033..27621ff9a3f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -243,7 +243,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - &cmake_ver cmake>=3.26.4
+          - &cmake_ver cmake>=3.26.4,!=3.30.0
           - &ninja ninja
   build_all:
     common:
@@ -287,8 +287,8 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=10.1.1,<11
-          - librmm==24.8.*
-          - libkvikio==24.8.*
+          - librmm==24.8.*,>=0.0.0a0
+          - libkvikio==24.8.*,>=0.0.0a0
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
           - nvcomp==3.0.6
@@ -500,7 +500,7 @@ dependencies:
       - output_types: [conda]
         packages:
           - breathe>=4.35.0
-          - dask-cuda==24.8.*
+          - dask-cuda==24.8.*,>=0.0.0a0
           - *doxygen
           - make
           - myst-nb
@@ -582,7 +582,7 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - pynvjitlink
+              - pynvjitlink>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
               - cubinlinker
@@ -592,7 +592,7 @@ dependencies:
           - matrix: {cuda: "12.*"}
             packages:
               - rmm-cu12==24.8.*,>=0.0.0a0
-              - pynvjitlink-cu12
+              - pynvjitlink-cu12>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
               - rmm-cu11==24.8.*,>=0.0.0a0
@@ -603,7 +603,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=0.20.30
+          - polars>=1.0
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -755,7 +755,7 @@ dependencies:
           - {matrix: null, packages: *cupy_packages_cu11}
   test_python_pandas_cudf:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
         packages:
           # dependencies to run pandas tests
           # https://github.com/pandas-dev/pandas/blob/main/environment.yml
@@ -766,7 +766,7 @@ dependencies:
           - pytest-reportlog
   test_python_cudf_pandas:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
         packages:
           - ipython
           - openpyxl
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 108f12bc099..c3c14ac8cad 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -372,7 +372,7 @@ def _generate_namespaces(namespaces):
 _all_namespaces = _generate_namespaces(
     {
         # Note that io::datasource is actually a nested class
-        "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"},
+        "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
         "numeric": {},
         "nvtext": {},
     }
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index e9dad705cbf..bd6f0f77357 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -18,22 +18,22 @@ This page provides API documentation for pylibcudf.
     filling
     gpumemoryview
     groupby
-    io/index.rst
     interop
     join
     lists
     merge
     quantiles
     reduce
+    replace
     reshape
     rolling
     round
     scalar
     search
-    stream_compaction
     sorting
-    replace
+    stream_compaction
     table
+    traits
     types
     unary
 
@@ -41,4 +41,5 @@ This page provides API documentation for pylibcudf.
     :maxdepth: 2
     :caption: Subpackages
 
+    io/index.rst
     strings/index.rst
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 0d53ac92db9..bde6d8094ce 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -16,3 +16,4 @@ I/O Functions
     :maxdepth: 1
 
     avro
+    json
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
new file mode 100644
index 00000000000..6aeae1f322a
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
@@ -0,0 +1,6 @@
+====
+JSON
+====
+
+.. automodule:: cudf._lib.pylibcudf.io.json
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
new file mode 100644
index 00000000000..294ca8dc78c
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
@@ -0,0 +1,6 @@
+======
+traits
+======
+
+.. automodule:: cudf._lib.pylibcudf.traits
+   :members:
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index a8fef907bad..22e34feb547 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -9,38 +9,27 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport (
-    make_sink_info,
-    make_source_info,
-    update_struct_field_names,
-)
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
 from cudf._lib.pylibcudf.libcudf.io.json cimport (
     json_reader_options,
     json_recovery_mode_t,
-    json_writer_options,
     read_json as libcudf_read_json,
     schema_element,
-    write_json as libcudf_write_json,
 )
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
-    column_name_info,
     compression_type,
-    sink_info,
-    table_metadata,
     table_with_metadata,
 )
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 from cudf._lib.types cimport dtype_to_data_type
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport data_from_unique_ptr
+
+import cudf._lib.pylibcudf as plc
 
 
 cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
@@ -175,45 +164,27 @@ def write_json(
     --------
     cudf.to_json
     """
-    cdef table_view input_table_view = table_view_from_table(
-        table, ignore_index=True
-    )
-
-    cdef unique_ptr[data_sink] data_sink_c
-    cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
-    cdef string na_c = na_rep.encode()
-    cdef bool include_nulls_c = include_nulls
-    cdef bool lines_c = lines
-    cdef int rows_per_chunk_c = rows_per_chunk
-    cdef string true_value_c = 'true'.encode()
-    cdef string false_value_c = 'false'.encode()
-    cdef table_metadata tbl_meta
-
-    num_index_cols_meta = 0
-    cdef column_name_info child_info
-    for i, name in enumerate(table._column_names, num_index_cols_meta):
-        child_info.name = name.encode()
-        tbl_meta.schema_info.push_back(child_info)
-        _set_col_children_metadata(
-            table[name]._column,
-            tbl_meta.schema_info[i]
-        )
+    cdef list colnames = []
 
-    cdef json_writer_options options = move(
-        json_writer_options.builder(sink_info_c, input_table_view)
-        .metadata(tbl_meta)
-        .na_rep(na_c)
-        .include_nulls(include_nulls_c)
-        .lines(lines_c)
-        .rows_per_chunk(rows_per_chunk_c)
-        .true_value(true_value_c)
-        .false_value(false_value_c)
-        .build()
-    )
+    for name in table._column_names:
+        colnames.append((name, _dtype_to_names_list(table[name]._column)))
 
     try:
-        with nogil:
-            libcudf_write_json(options)
+        plc.io.json.write_json(
+            plc.io.SinkInfo([path_or_buf]),
+            plc.io.TableWithMetadata(
+                plc.Table([
+                    c.to_pylibcudf(mode="read") for c in table._columns
+                ]),
+                colnames
+            ),
+            na_rep,
+            include_nulls,
+            lines,
+            rows_per_chunk,
+            true_value="true",
+            false_value="false"
+        )
     except OverflowError:
         raise OverflowError(
             f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
@@ -254,23 +225,12 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
         )
     return dtype_to_data_type(dtype)
 
-cdef _set_col_children_metadata(Column col,
-                                column_name_info& col_meta):
-    cdef column_name_info child_info
+
+def _dtype_to_names_list(col):
     if isinstance(col.dtype, cudf.StructDtype):
-        for i, (child_col, name) in enumerate(
-            zip(col.children, list(col.dtype.fields))
-        ):
-            child_info.name = name.encode()
-            col_meta.children.push_back(child_info)
-            _set_col_children_metadata(
-                child_col, col_meta.children[i]
-            )
+        return [(name, _dtype_to_names_list(child))
+                for name, child in zip(col.dtype.fields, col.children)]
     elif isinstance(col.dtype, cudf.ListDtype):
-        for i, child_col in enumerate(col.children):
-            col_meta.children.push_back(child_info)
-            _set_col_children_metadata(
-                child_col, col_meta.children[i]
-            )
-    else:
-        return
+        return [("", _dtype_to_names_list(child))
+                for child in col.children]
+    return []
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 0a198f431a7..d22096081af 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -38,6 +38,7 @@ set(cython_sources
     stream_compaction.pyx
     sorting.pyx
     table.pyx
+    traits.pyx
     types.pyx
     unary.pyx
     utils.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 5131df9a5cd..d4d615cde34 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -23,6 +23,7 @@ from . cimport (
     sorting,
     stream_compaction,
     strings,
+    traits,
     types,
     unary,
 )
@@ -54,12 +55,14 @@ __all__ = [
     "quantiles",
     "reduce",
     "replace",
+    "reshape",
     "rolling",
     "round",
     "search",
     "stream_compaction",
     "strings",
     "sorting",
+    "traits",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 43a9e2aca31..91f8acaf682 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -23,6 +23,7 @@
     sorting,
     stream_compaction,
     strings,
+    traits,
     types,
     unary,
 )
@@ -35,6 +36,7 @@
 __all__ = [
     "Column",
     "DataType",
+    "MaskState",
     "Scalar",
     "Table",
     "TypeId",
@@ -54,12 +56,14 @@
     "quantiles",
     "reduce",
     "replace",
+    "reshape",
     "rolling",
     "round",
     "search",
     "stream_compaction",
     "strings",
     "sorting",
+    "traits",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
index 07e9d1ead11..adf7e1fd7e8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+from cpython cimport pycapsule
 from cython.operator cimport dereference
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
@@ -11,9 +12,15 @@ from functools import singledispatch
 
 from pyarrow import lib as pa
 
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.interop cimport (
+    ArrowArray,
+    ArrowArrayStream,
+    ArrowSchema,
     column_metadata,
     from_arrow as cpp_from_arrow,
+    from_arrow_column as cpp_from_arrow_column,
+    from_arrow_stream as cpp_from_arrow_stream,
     to_arrow as cpp_to_arrow,
 )
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
@@ -124,11 +131,15 @@ def _from_arrow_datatype(pyarrow_object):
 def _from_arrow_table(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
         raise ValueError("data_type may not be passed for tables")
-    cdef shared_ptr[pa.CTable] arrow_table = pa.pyarrow_unwrap_table(pyarrow_object)
+    stream = pyarrow_object.__arrow_c_stream__()
+    cdef ArrowArrayStream* c_stream = (
+        <ArrowArrayStream*>pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream")
+    )
 
     cdef unique_ptr[table] c_result
     with nogil:
-        c_result = move(cpp_from_arrow(dereference(arrow_table)))
+        # The libcudf function here will release the stream.
+        c_result = move(cpp_from_arrow_stream(c_stream))
 
     return Table.from_libcudf(move(c_result))
 
@@ -190,8 +201,25 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None):
 def _from_arrow_column(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
         raise ValueError("data_type may not be passed for arrays")
-    pa_table = pa.table([pyarrow_object], [""])
-    return from_arrow(pa_table).columns()[0]
+
+    schema, array = pyarrow_object.__arrow_c_array__()
+    cdef ArrowSchema* c_schema = (
+        <ArrowSchema*>pycapsule.PyCapsule_GetPointer(schema, "arrow_schema")
+    )
+    cdef ArrowArray* c_array = (
+        <ArrowArray*>pycapsule.PyCapsule_GetPointer(array, "arrow_array")
+    )
+
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(cpp_from_arrow_column(c_schema, c_array))
+
+    # The capsule destructors should release automatically for us, but we
+    # choose to do it explicitly here for clarity.
+    c_schema.release(c_schema)
+    c_array.release(c_array)
+
+    return Column.from_libcudf(move(c_result))
 
 
 @singledispatch
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
index 32f0f5543e4..084b341ec48 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx datasource.pyx types.pyx)
+set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
@@ -21,5 +21,7 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
 )
 
-set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types)
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json
+                                pylibcudf_io_types
+)
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
index cfd6d2cd281..ef4c65b277e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport avro, datasource, types
+from . cimport avro, datasource, json, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
index a54ba1834dc..fb4e4c7e4bb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, datasource, types
-from .types import SourceInfo, TableWithMetadata
+from . import avro, datasource, json, types
+from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
index 946e0896fc8..538bd8aa322 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
@@ -19,7 +19,7 @@ cpdef TableWithMetadata read_avro(
     size_type num_rows = -1
 ):
     """
-    Reads an Avro dataset into a set of columns.
+    Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
 
     Parameters
     ----------
@@ -36,7 +36,7 @@ cpdef TableWithMetadata read_avro(
     Returns
     -------
     TableWithMetadata
-        The Table and its corresponding metadata that was read in.
+        The Table and its corresponding metadata (column names) that were read in.
     """
     cdef vector[string] c_columns
     if columns is not None and len(columns) > 0:
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
new file mode 100644
index 00000000000..a91d574131f
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef void write_json(
+    SinkInfo sink_info,
+    TableWithMetadata tbl,
+    str na_rep = *,
+    bool include_nulls = *,
+    bool lines = *,
+    size_type rows_per_chunk = *,
+    str true_value = *,
+    str false_value = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
new file mode 100644
index 00000000000..7530eba3803
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
@@ -0,0 +1,68 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.limits cimport numeric_limits
+from libcpp.string cimport string
+
+from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.json cimport (
+    json_writer_options,
+    write_json as cpp_write_json,
+)
+from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata
+from cudf._lib.pylibcudf.types cimport size_type
+
+
+cpdef void write_json(
+    SinkInfo sink_info,
+    TableWithMetadata table_w_meta,
+    str na_rep = "",
+    bool include_nulls = False,
+    bool lines = False,
+    size_type rows_per_chunk = numeric_limits[size_type].max(),
+    str true_value = "true",
+    str false_value = "false"
+):
+    """
+    Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format.
+
+    Parameters
+    ----------
+    sink_info: SinkInfo
+        The SinkInfo object to write the JSON to.
+    table_w_meta: TableWithMetadata
+        The TableWithMetadata object containing the Table to write
+    na_rep: str, default ""
+        The string representation for null values.
+    include_nulls: bool, default False
+        Enables/Disables output of nulls as 'null'.
+    lines: bool, default False
+        If `True`, write output in the JSON lines format.
+    rows_per_chunk: size_type, defaults to length of the input table
+        The maximum number of rows to write at a time.
+    true_value: str, default "true"
+        The string representation for values != 0 in INT8 types.
+    false_value: str, default "false"
+        The string representation for values == 0 in INT8 types.
+    """
+    cdef table_metadata tbl_meta = table_w_meta.metadata
+    cdef string na_rep_c = na_rep.encode()
+
+    cdef json_writer_options options = (
+        json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
+        .metadata(tbl_meta)
+        .na_rep(na_rep_c)
+        .include_nulls(include_nulls)
+        .lines(lines)
+        .build()
+    )
+
+    if rows_per_chunk != numeric_limits[size_type].max():
+        options.set_rows_per_chunk(rows_per_chunk)
+    if true_value != "true":
+        options.set_true_value(<string>true_value.encode())
+    if false_value != "false":
+        options.set_false_value(<string>false_value.encode())
+
+    with nogil:
+        cpp_write_json(options)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
index aa846a47343..88daf54f33b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
@@ -1,4 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
     column_encoding,
     column_in_metadata,
@@ -22,8 +26,15 @@ cdef class TableWithMetadata:
     cdef public Table tbl
     cdef table_metadata metadata
 
+    cdef vector[column_name_info] _make_column_info(self, list column_names)
+
     @staticmethod
     cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)
 
 cdef class SourceInfo:
     cdef source_info c_obj
+
+cdef class SinkInfo:
+    # This vector just exists to keep the unique_ptrs to the sinks alive
+    cdef vector[unique_ptr[data_sink]] sink_storage
+    cdef sink_info c_obj
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
index ab3375da662..f94e20970a4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -1,17 +1,23 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cpython.buffer cimport PyBUF_READ
+from cpython.memoryview cimport PyMemoryView_FromMemory
+from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.pylibcudf.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    column_name_info,
     host_buffer,
     source_info,
     table_with_metadata,
 )
 
+import codecs
 import errno
 import io
 import os
@@ -22,7 +28,39 @@ cdef class TableWithMetadata:
     (e.g. column names)
 
     For details, see :cpp:class:`cudf::io::table_with_metadata`.
+
+    Parameters
+    ----------
+    tbl : Table
+        The input table.
+    column_names : list
+        A list of tuples each containing the name of each column
+        and the names of its child columns (in the same format).
+        e.g.
+        [("id", []), ("name", [("first", []), ("last", [])])]
+
     """
+    def __init__(self, Table tbl, list column_names):
+        self.tbl = tbl
+
+        self.metadata.schema_info = self._make_column_info(column_names)
+
+    cdef vector[column_name_info] _make_column_info(self, list column_names):
+        cdef vector[column_name_info] col_name_infos
+        cdef column_name_info info
+
+        col_name_infos.reserve(len(column_names))
+
+        for name, child_names in column_names:
+            if not isinstance(name, str):
+                raise ValueError("Column name must be a string!")
+
+            info.name = <string> name.encode()
+            info.children = self._make_column_info(child_names)
+
+            col_name_infos.push_back(info)
+
+        return col_name_infos
 
     @property
     def columns(self):
@@ -51,6 +89,7 @@ cdef class TableWithMetadata:
         out.metadata = tbl_with_meta.metadata
         return out
 
+
 cdef class SourceInfo:
     """A class containing details on a source to read from.
 
@@ -119,7 +158,87 @@ cdef class SourceInfo:
             raise ValueError("Sources must be a list of str/paths, "
                              "bytes, io.BytesIO, or a Datasource")
 
-        if empty_buffer is True:
-            c_host_buffers.push_back(host_buffer(<char*>NULL, 0))
+        self.c_obj = source_info(c_host_buffers)
+
+
+# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you
+# write from cudf to any python file-like object (File/BytesIO/SocketIO etc)
+cdef cppclass iobase_data_sink(data_sink):
+    object buf
+
+    iobase_data_sink(object buf_):
+        this.buf = buf_
+
+    void host_write(const void * data, size_t size) with gil:
+        if isinstance(buf, io.TextIOBase):
+            buf.write(PyMemoryView_FromMemory(<char*>data, size, PyBUF_READ)
+                      .tobytes().decode())
+        else:
+            buf.write(PyMemoryView_FromMemory(<char*>data, size, PyBUF_READ))
+
+    void flush() with gil:
+        buf.flush()
+
+    size_t bytes_written() with gil:
+        return buf.tell()
+
+
+cdef class SinkInfo:
+    """A class containing details on a source to read from.
+
+    For details, see :cpp:class:`cudf::io::sink_info`.
+
+    Parameters
+    ----------
+    sinks : list of str, PathLike, BytesIO, StringIO
+
+        A homogeneous list of sinks (this can be a string filename,
+        bytes, or one of the Python I/O classes) to read from.
+
+        Mixing different types of sinks will raise a `ValueError`.
+    """
+
+    def __init__(self, list sinks):
+        cdef vector[data_sink *] data_sinks
+        cdef vector[string] paths
+
+        if not sinks:
+            raise ValueError("Need to pass at least one sink")
+
+        if isinstance(sinks[0], os.PathLike):
+            sinks = [os.path.expanduser(s) for s in sinks]
+
+        cdef object initial_sink_cls = type(sinks[0])
+
+        if not all(isinstance(s, initial_sink_cls) for s in sinks):
+            raise ValueError("All sinks must be of the same type!")
+
+        if initial_sink_cls in {io.StringIO, io.BytesIO, io.TextIOBase}:
+            data_sinks.reserve(len(sinks))
+            if isinstance(sinks[0], (io.StringIO, io.BytesIO)):
+                for s in sinks:
+                    self.sink_storage.push_back(
+                        unique_ptr[data_sink](new iobase_data_sink(s))
+                    )
+            elif isinstance(sinks[0], io.TextIOBase):
+                for s in sinks:
+                    if codecs.lookup(s).name not in ('utf-8', 'ascii'):
+                        raise NotImplementedError(f"Unsupported encoding {s.encoding}")
+                    self.sink_storage.push_back(
+                        unique_ptr[data_sink](new iobase_data_sink(s.buffer))
+                    )
+            data_sinks.push_back(self.sink_storage.back().get())
+        elif initial_sink_cls is str:
+            paths.reserve(len(sinks))
+            for s in sinks:
+                paths.push_back(<string> s.encode())
+        else:
+            raise TypeError(
+                "Unrecognized input type: {}".format(type(sinks[0]))
+            )
 
-        self.c_obj = move(source_info(c_host_buffers))
+        if data_sinks.size() > 0:
+            self.c_obj = sink_info(data_sinks)
+        else:
+            # we don't have sinks so we must have paths to sinks
+            self.c_obj = sink_info(paths)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
index 471b78505fb..2151da28d4b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
@@ -7,6 +7,7 @@ from pyarrow.lib cimport CScalar, CTable
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
@@ -16,6 +17,19 @@ cdef extern from "dlpack/dlpack.h" nogil:
     ctypedef struct DLManagedTensor:
         void(*deleter)(DLManagedTensor*) except +
 
+
+# The Arrow structs are not namespaced.
+cdef extern from "cudf/interop.hpp" nogil:
+    cdef struct ArrowSchema:
+        void (*release)(ArrowSchema*) noexcept nogil
+
+    cdef struct ArrowArray:
+        void (*release)(ArrowArray*) noexcept nogil
+
+    cdef struct ArrowArrayStream:
+        void (*release)(ArrowArrayStream*) noexcept nogil
+
+
 cdef extern from "cudf/interop.hpp" namespace "cudf" \
         nogil:
     cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor
@@ -42,3 +56,9 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
         const scalar& input,
         column_metadata metadata,
     ) except +
+
+    cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except +
+    cdef unique_ptr[column] from_arrow_column(
+        const ArrowSchema* schema,
+        const ArrowArray* input
+    ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
new file mode 100644
index 00000000000..0382a5d42c3
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+
+
+cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] reverse(
+        const lists_column_view& lists_column,
+    ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
new file mode 100644
index 00000000000..0cc58af735b
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
+
+
+cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil:
+    cdef bool is_relationally_comparable(data_type)
+    cdef bool is_equality_comparable(data_type)
+    cdef bool is_numeric(data_type)
+    cdef bool is_index_type(data_type)
+    cdef bool is_unsigned(data_type)
+    cdef bool is_integral(data_type)
+    cdef bool is_integral_not_bool(data_type)
+    cdef bool is_floating_point(data_type)
+    cdef bool is_boolean(data_type)
+    cdef bool is_timestamp(data_type)
+    cdef bool is_fixed_point(data_type)
+    cdef bool is_duration(data_type)
+    cdef bool is_chrono(data_type)
+    cdef bool is_dictionary(data_type)
+    cdef bool is_fixed_width(data_type)
+    cdef bool is_compound(data_type)
+    cdef bool is_nested(data_type)
+    cdef bool is_bit_castable(data_type, data_type)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 2ccf0139e90..c9d0a84e8ac 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -23,3 +23,5 @@ cpdef Column contains(Column, ColumnOrScalar)
 cpdef Column contains_nulls(Column)
 
 cpdef Column index_of(Column, ColumnOrScalar, bool)
+
+cpdef Column reverse(Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index a94d940accd..651f1346f88 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
+    reverse as cpp_reverse,
 )
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
@@ -206,3 +207,28 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
             find_option,
         ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column reverse(Column input):
+    """Reverse the element order within each list of the input column.
+
+    For details, see :cpp:func:`reverse`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+
+    Returns
+    -------
+    Column
+        A new Column with reversed lists.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    with nogil:
+        c_result = move(cpp_reverse.reverse(
+            list_view.view(),
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd
new file mode 100644
index 00000000000..668fa775202
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+from .types cimport DataType
+
+
+cpdef bool is_relationally_comparable(DataType typ)
+cpdef bool is_equality_comparable(DataType typ)
+cpdef bool is_numeric(DataType typ)
+cpdef bool is_index_type(DataType typ)
+cpdef bool is_unsigned(DataType typ)
+cpdef bool is_integral(DataType typ)
+cpdef bool is_integral_not_bool(DataType typ)
+cpdef bool is_floating_point(DataType typ)
+cpdef bool is_boolean(DataType typ)
+cpdef bool is_timestamp(DataType typ)
+cpdef bool is_fixed_point(DataType typ)
+cpdef bool is_duration(DataType typ)
+cpdef bool is_chrono(DataType typ)
+cpdef bool is_dictionary(DataType typ)
+cpdef bool is_fixed_width(DataType typ)
+cpdef bool is_compound(DataType typ)
+cpdef bool is_nested(DataType typ)
+cpdef bool is_bit_castable(DataType source, DataType target)
diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx
new file mode 100644
index 00000000000..d2370f8d641
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx
@@ -0,0 +1,151 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+from cudf._lib.pylibcudf.libcudf.utilities cimport traits
+
+from .types cimport DataType
+
+
+cpdef bool is_relationally_comparable(DataType typ):
+    """Checks if the given data type supports relational comparisons.
+
+    For details, see :cpp:func:`is_relationally_comparable`.
+    """
+    return traits.is_relationally_comparable(typ.c_obj)
+
+
+cpdef bool is_equality_comparable(DataType typ):
+    """Checks if the given data type supports equality comparisons.
+
+    For details, see :cpp:func:`is_equality_comparable`.
+    """
+    return traits.is_equality_comparable(typ.c_obj)
+
+
+cpdef bool is_numeric(DataType typ):
+    """Checks if the given data type is numeric.
+
+    For details, see :cpp:func:`is_numeric`.
+    """
+    return traits.is_numeric(typ.c_obj)
+
+
+cpdef bool is_index_type(DataType typ):
+    """Checks if the given data type is an index type.
+
+    For details, see :cpp:func:`is_index_type`.
+    """
+    return traits.is_index_type(typ.c_obj)
+
+
+cpdef bool is_unsigned(DataType typ):
+    """Checks if the given data type is an unsigned type.
+
+    For details, see :cpp:func:`is_unsigned`.
+    """
+    return traits.is_unsigned(typ.c_obj)
+
+
+cpdef bool is_integral(DataType typ):
+    """Checks if the given data type is an integral type.
+
+    For details, see :cpp:func:`is_integral`.
+    """
+    return traits.is_integral(typ.c_obj)
+
+
+cpdef bool is_integral_not_bool(DataType typ):
+    """Checks if the given data type is an integral type excluding booleans.
+
+    For details, see :cpp:func:`is_integral_not_bool`.
+    """
+    return traits.is_integral_not_bool(typ.c_obj)
+
+
+cpdef bool is_floating_point(DataType typ):
+    """Checks if the given data type is a floating point type.
+
+    For details, see :cpp:func:`is_floating_point`.
+    """
+    return traits.is_floating_point(typ.c_obj)
+
+
+cpdef bool is_boolean(DataType typ):
+    """Checks if the given data type is a boolean type.
+
+    For details, see :cpp:func:`is_boolean`.
+    """
+    return traits.is_boolean(typ.c_obj)
+
+
+cpdef bool is_timestamp(DataType typ):
+    """Checks if the given data type is a timestamp type.
+
+    For details, see :cpp:func:`is_timestamp`.
+    """
+    return traits.is_timestamp(typ.c_obj)
+
+
+cpdef bool is_fixed_point(DataType typ):
+    """Checks if the given data type is a fixed point type.
+
+    For details, see :cpp:func:`is_fixed_point`.
+    """
+    return traits.is_fixed_point(typ.c_obj)
+
+
+cpdef bool is_duration(DataType typ):
+    """Checks if the given data type is a duration type.
+
+    For details, see :cpp:func:`is_duration`.
+    """
+    return traits.is_duration(typ.c_obj)
+
+
+cpdef bool is_chrono(DataType typ):
+    """Checks if the given data type is a chrono type.
+
+    For details, see :cpp:func:`is_chrono`.
+    """
+    return traits.is_chrono(typ.c_obj)
+
+
+cpdef bool is_dictionary(DataType typ):
+    """Checks if the given data type is a dictionary type.
+
+    For details, see :cpp:func:`is_dictionary`.
+    """
+    return traits.is_dictionary(typ.c_obj)
+
+
+cpdef bool is_fixed_width(DataType typ):
+    """Checks if the given data type is a fixed width type.
+
+    For details, see :cpp:func:`is_fixed_width`.
+    """
+    return traits.is_fixed_width(typ.c_obj)
+
+
+cpdef bool is_compound(DataType typ):
+    """Checks if the given data type is a compound type.
+
+    For details, see :cpp:func:`is_compound`.
+    """
+    return traits.is_compound(typ.c_obj)
+
+
+cpdef bool is_nested(DataType typ):
+    """Checks if the given data type is a nested type.
+
+    For details, see :cpp:func:`is_nested`.
+    """
+    return traits.is_nested(typ.c_obj)
+
+
+cpdef bool is_bit_castable(DataType source, DataType target):
+    """Checks if the source type is bit-castable to the target type.
+
+    For details, see :cpp:func:`is_bit_castable`.
+    """
+    return traits.is_bit_castable(source.c_obj, target.c_obj)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 5db6fd904a9..e7a2863da8c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -959,6 +959,15 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
         raise NotImplementedError()
 
     def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
+        if len(self) == 0:
+            dtype = cudf.dtype(dtype)
+            if self.dtype == dtype:
+                if copy:
+                    return self.copy()
+                else:
+                    return self
+            else:
+                return column_empty(0, dtype=dtype, masked=self.nullable)
         if copy:
             col = self.copy()
         else:
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 121076b69ce..c10aceba9f4 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -280,8 +280,8 @@ def __contains__(self, item: ScalarLike) -> bool:
             return False
         elif ts.tzinfo is not None:
             ts = ts.tz_convert(None)
-        return ts.to_numpy().astype("int64") in self.as_numerical_column(
-            "int64"
+        return ts.to_numpy().astype("int64") in cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
         )
 
     @functools.cached_property
@@ -503,9 +503,9 @@ def mean(
         self, skipna=None, min_count: int = 0, dtype=np.float64
     ) -> ScalarLike:
         return pd.Timestamp(
-            self.as_numerical_column("int64").mean(
-                skipna=skipna, min_count=min_count, dtype=dtype
-            ),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).mean(skipna=skipna, min_count=min_count, dtype=dtype),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -517,7 +517,7 @@ def std(
         ddof: int = 1,
     ) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").std(
+            cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
                 skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
             )
             * _unit_to_nanoseconds_conversion[self.time_unit],
@@ -525,7 +525,9 @@ def std(
 
     def median(self, skipna: bool | None = None) -> pd.Timestamp:
         return pd.Timestamp(
-            self.as_numerical_column("int64").median(skipna=skipna),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).median(skipna=skipna),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -534,18 +536,18 @@ def cov(self, other: DatetimeColumn) -> float:
             raise TypeError(
                 f"cannot perform cov with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").cov(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def corr(self, other: DatetimeColumn) -> float:
         if not isinstance(other, DatetimeColumn):
             raise TypeError(
                 f"cannot perform corr with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").corr(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def quantile(
         self,
@@ -554,7 +556,7 @@ def quantile(
         exact: bool,
         return_scalar: bool,
     ) -> ColumnBase:
-        result = self.as_numerical_column("int64").quantile(
+        result = self.astype("int64").quantile(
             q=q,
             interpolation=interpolation,
             exact=exact,
@@ -645,12 +647,12 @@ def indices_of(
     ) -> cudf.core.column.NumericalColumn:
         value = column.as_column(
             pd.to_datetime(value), dtype=self.dtype
-        ).as_numerical_column("int64")
-        return self.as_numerical_column("int64").indices_of(value)
+        ).astype("int64")
+        return self.astype("int64").indices_of(value)
 
     @property
     def is_unique(self) -> bool:
-        return self.as_numerical_column("int64").is_unique
+        return self.astype("int64").is_unique
 
     def isin(self, values: Sequence) -> ColumnBase:
         return cudf.core.tools.datetimes._isin_datetimelike(self, values)
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index d66908b5f94..3e238d65cff 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -157,7 +157,7 @@ def normalize_binop_value(self, other):
                         "Decimal columns only support binary operations with "
                         "integer numerical columns."
                     )
-                other = other.as_decimal_column(
+                other = other.astype(
                     self.dtype.__class__(self.dtype.__class__.MAX_PRECISION, 0)
                 )
             elif not isinstance(other, DecimalBaseColumn):
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index f24ca3fdad1..d09a1f66539 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -4,7 +4,7 @@
 
 import cudf
 from cudf.core.column import StructColumn
-from cudf.core.dtypes import CategoricalDtype, IntervalDtype
+from cudf.core.dtypes import IntervalDtype
 
 
 class IntervalColumn(StructColumn):
@@ -87,20 +87,16 @@ def copy(self, deep=True):
 
     def as_interval_column(self, dtype):
         if isinstance(dtype, IntervalDtype):
-            if isinstance(self.dtype, CategoricalDtype):
-                new_struct = self._get_decategorized_column()
-                return IntervalColumn.from_struct_column(new_struct)
-            else:
-                return IntervalColumn(
-                    size=self.size,
-                    dtype=dtype,
-                    mask=self.mask,
-                    offset=self.offset,
-                    null_count=self.null_count,
-                    children=tuple(
-                        child.astype(dtype.subtype) for child in self.children
-                    ),
-                )
+            return IntervalColumn(
+                size=self.size,
+                dtype=dtype,
+                mask=self.mask,
+                offset=self.offset,
+                null_count=self.null_count,
+                children=tuple(
+                    child.astype(dtype.subtype) for child in self.children
+                ),
+            )
         else:
             raise ValueError("dtype must be IntervalDtype")
 
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index c548db67344..1992d471947 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -73,10 +73,15 @@ def memory_usage(self):
             child0_size = (
                 current_base_child.size + 1 - current_offset
             ) * current_base_child.base_children[0].dtype.itemsize
-            current_offset = current_base_child.base_children[
-                0
-            ].element_indexing(current_offset)
             n += child0_size
+            current_offset_col = current_base_child.base_children[0]
+            if not len(current_offset_col):
+                # See https://github.com/rapidsai/cudf/issues/16164 why
+                # offset column can be uninitialized
+                break
+            current_offset = current_offset_col.element_indexing(
+                current_offset
+            )
             current_base_child = current_base_child.base_children[1]
 
         n += (
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 8f41bcb6422..5a0171bbbdc 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -107,7 +107,9 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool:
             # np.timedelta64 raises ValueError, hence `item`
             # cannot exist in `self`.
             return False
-        return item.view("int64") in self.as_numerical_column("int64")
+        return item.view("int64") in cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        )
 
     @property
     def values(self):
@@ -132,9 +134,7 @@ def to_arrow(self) -> pa.Array:
                 self.mask_array_view(mode="read").copy_to_host()
             )
         data = pa.py_buffer(
-            self.as_numerical_column("int64")
-            .data_array_view(mode="read")
-            .copy_to_host()
+            self.astype("int64").data_array_view(mode="read").copy_to_host()
         )
         pa_dtype = np_to_pa_dtype(self.dtype)
         return pa.Array.from_buffers(
@@ -295,13 +295,17 @@ def as_timedelta_column(
 
     def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).mean(skipna=skipna, dtype=dtype),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
     def median(self, skipna: bool | None = None) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").median(skipna=skipna),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).median(skipna=skipna),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -315,7 +319,7 @@ def quantile(
         exact: bool,
         return_scalar: bool,
     ) -> ColumnBase:
-        result = self.as_numerical_column("int64").quantile(
+        result = self.astype("int64").quantile(
             q=q,
             interpolation=interpolation,
             exact=exact,
@@ -337,7 +341,7 @@ def sum(
             # Since sum isn't overridden in Numerical[Base]Column, mypy only
             # sees the signature from Reducible (which doesn't have the extra
             # parameters from ColumnBase._reduce) so we have to ignore this.
-            self.as_numerical_column("int64").sum(  # type: ignore
+            self.astype("int64").sum(  # type: ignore
                 skipna=skipna, min_count=min_count, dtype=dtype
             ),
             unit=self.time_unit,
@@ -351,7 +355,7 @@ def std(
         ddof: int = 1,
     ) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").std(
+            cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
                 skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
             ),
             unit=self.time_unit,
@@ -362,18 +366,18 @@ def cov(self, other: TimeDeltaColumn) -> float:
             raise TypeError(
                 f"cannot perform cov with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").cov(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def corr(self, other: TimeDeltaColumn) -> float:
         if not isinstance(other, TimeDeltaColumn):
             raise TypeError(
                 f"cannot perform corr with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").corr(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def components(self) -> dict[str, ColumnBase]:
         """
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4dfeb68b7ba..b249410c2e4 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2404,7 +2404,7 @@ def scatter_by_map(
         if isinstance(map_index, cudf.core.column.StringColumn):
             cat_index = cast(
                 cudf.core.column.CategoricalColumn,
-                map_index.as_categorical_column("category"),
+                map_index.astype("category"),
             )
             map_index = cat_index.codes
             warnings.warn(
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 034849d0e71..de715191c08 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -937,7 +937,7 @@ def to_pandas(self) -> pd.IntervalDtype:
     def __eq__(self, other):
         if isinstance(other, str):
             # This means equality isn't transitive but mimics pandas
-            return other == self.name
+            return other in (self.name, str(self))
         return (
             type(self) == type(other)
             and self.subtype == other.subtype
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 9bac75dc6ac..253d200f7d4 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -927,7 +927,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
                 # of column is 0 (i.e., empty) then we will have an
                 # int8 column in result._data[name] returned by libcudf,
                 # which needs to be type-casted to 'category' dtype.
-                result[name] = result[name].as_categorical_column("category")
+                result[name] = result[name].astype("category")
             elif (
                 pandas_dtypes.get(name) == "empty"
                 and np_dtypes.get(name) == "object"
@@ -936,7 +936,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
                 # is specified as 'empty' and np_dtypes as 'object',
                 # hence handling this special case to type-cast the empty
                 # float column to str column.
-                result[name] = result[name].as_string_column(cudf.dtype("str"))
+                result[name] = result[name].astype(cudf.dtype("str"))
             elif name in data.column_names and isinstance(
                 data[name].type,
                 (
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index 73a1cd26367..a5fed02cbed 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -229,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec:
     else:
         key = cudf.core.column.as_column(key)
         if isinstance(key, cudf.core.column.CategoricalColumn):
-            key = key.as_numerical_column(key.codes.dtype)
+            key = key.astype(key.codes.dtype)
         if is_bool_dtype(key.dtype):
             return MaskIndexer(BooleanMask(key, n))
         elif len(key) == 0:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 97b6bbec2d4..4a60470fafa 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3107,10 +3107,12 @@ def value_counts(
         # Pandas returns an IntervalIndex as the index of res
         # this condition makes sure we do too if bins is given
         if bins is not None and len(res) == len(res.index.categories):
-            int_index = IntervalColumn.as_interval_column(
-                res.index._column, res.index.categories.dtype
+            interval_col = IntervalColumn.from_struct_column(
+                res.index._column._get_decategorized_column()
+            )
+            res.index = cudf.IntervalIndex._from_data(
+                {res.index.name: interval_col}
             )
-            res.index = int_index
         res.name = result_name
         return res
 
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 68b23f1e059..ef6b86a04a7 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -115,11 +115,11 @@ def to_numeric(arg, errors="raise", downcast=None):
     dtype = col.dtype
 
     if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
-        col = col.as_numerical_column(cudf.dtype("int64"))
+        col = col.astype(cudf.dtype("int64"))
     elif isinstance(dtype, CategoricalDtype):
         cat_dtype = col.dtype.type
         if _is_non_decimal_numeric_dtype(cat_dtype):
-            col = col.as_numerical_column(cat_dtype)
+            col = col.astype(cat_dtype)
         else:
             try:
                 col = _convert_str_col(
@@ -146,8 +146,8 @@ def to_numeric(arg, errors="raise", downcast=None):
         raise ValueError("Unrecognized datatype")
 
     # str->float conversion may require lower precision
-    if col.dtype == cudf.dtype("f"):
-        col = col.as_numerical_column("d")
+    if col.dtype == cudf.dtype("float32"):
+        col = col.astype("float64")
 
     if downcast:
         if downcast == "float":
@@ -205,7 +205,7 @@ def _convert_str_col(col, errors, _downcast=None):
 
     is_integer = libstrings.is_integer(col)
     if is_integer.all():
-        return col.as_numerical_column(dtype=cudf.dtype("i8"))
+        return col.astype(dtype=cudf.dtype("i8"))
 
     col = _proc_inf_empty_strings(col)
 
@@ -218,9 +218,9 @@ def _convert_str_col(col, errors, _downcast=None):
                     "limited by float32 precision."
                 )
             )
-            return col.as_numerical_column(dtype=cudf.dtype("f"))
+            return col.astype(dtype=cudf.dtype("float32"))
         else:
-            return col.as_numerical_column(dtype=cudf.dtype("d"))
+            return col.astype(dtype=cudf.dtype("float64"))
     else:
         if errors == "coerce":
             col = libcudf.string_casting.stod(col)
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index bf927e661fe..d41e6c720bf 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -1,24 +1,39 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from __future__ import annotations
 
+import io
+import os
+
 import pyarrow as pa
 import pytest
 
 from cudf._lib import pylibcudf as plc
 
 
-def metadata_from_arrow_array(
-    pa_array: pa.Array,
+def metadata_from_arrow_type(
+    pa_type: pa.Array,
+    name: str = "",
 ) -> plc.interop.ColumnMetadata | None:
-    metadata = None
-    if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype):
+    metadata = plc.interop.ColumnMetadata(name)  # None
+    if pa.types.is_list(pa_type):
+        child_meta = [plc.interop.ColumnMetadata("offsets")]
+        for i in range(pa_type.num_fields):
+            field_meta = metadata_from_arrow_type(
+                pa_type.field(i).type, pa_type.field(i).name
+            )
+            child_meta.append(field_meta)
+        metadata = plc.interop.ColumnMetadata(name, child_meta)
+    elif pa.types.is_struct(pa_type):
+        child_meta = []
+        for i in range(pa_type.num_fields):
+            field_meta = metadata_from_arrow_type(
+                pa_type.field(i).type, pa_type.field(i).name
+            )
+            child_meta.append(field_meta)
         metadata = plc.interop.ColumnMetadata(
-            "",
+            name,
             # libcudf does not store field names, so just match pyarrow's.
-            [
-                plc.interop.ColumnMetadata(pa_array.type.field(i).name)
-                for i in range(pa_array.type.num_fields)
-            ],
+            child_meta,
         )
     return metadata
 
@@ -32,13 +47,13 @@ def assert_column_eq(
         rhs, plc.Column
     ):
         rhs = plc.interop.to_arrow(
-            rhs, metadata=metadata_from_arrow_array(lhs)
+            rhs, metadata=metadata_from_arrow_type(lhs.type)
         )
     elif isinstance(lhs, plc.Column) and isinstance(
         rhs, (pa.Array, pa.ChunkedArray)
     ):
         lhs = plc.interop.to_arrow(
-            lhs, metadata=metadata_from_arrow_array(rhs)
+            lhs, metadata=metadata_from_arrow_type(rhs.type)
         )
     else:
         raise ValueError(
@@ -87,56 +102,84 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs):
     return pytest.raises(expected_exception, *args, **kwargs)
 
 
-# TODO: Consider moving these type utilities into pylibcudf.types itself.
-def is_signed_integer(plc_dtype: plc.DataType):
-    return (
-        plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value
-    )
-
-
-def is_unsigned_integer(plc_dtype: plc.DataType):
-    return plc_dtype.id() in (
-        plc.TypeId.UINT8,
-        plc.TypeId.UINT16,
-        plc.TypeId.UINT32,
-        plc.TypeId.UINT64,
-    )
-
+def is_string(plc_dtype: plc.DataType):
+    return plc_dtype.id() == plc.TypeId.STRING
 
-def is_integer(plc_dtype: plc.DataType):
-    return plc_dtype.id() in (
-        plc.TypeId.INT8,
-        plc.TypeId.INT16,
-        plc.TypeId.INT32,
-        plc.TypeId.INT64,
-    )
 
+def nesting_level(typ) -> tuple[int, int]:
+    """Return list and struct nesting of a pyarrow type."""
+    if isinstance(typ, pa.ListType):
+        list_, struct = nesting_level(typ.value_type)
+        return list_ + 1, struct
+    elif isinstance(typ, pa.StructType):
+        lists, structs = map(max, zip(*(nesting_level(t.type) for t in typ)))
+        return lists, structs + 1
+    else:
+        return 0, 0
 
-def is_floating(plc_dtype: plc.DataType):
-    return plc_dtype.id() in (
-        plc.TypeId.FLOAT32,
-        plc.TypeId.FLOAT64,
-    )
 
+def is_nested_struct(typ):
+    return nesting_level(typ)[1] > 1
 
-def is_boolean(plc_dtype: plc.DataType):
-    return plc_dtype.id() == plc.TypeId.BOOL8
 
+def is_nested_list(typ):
+    return nesting_level(typ)[0] > 1
 
-def is_string(plc_dtype: plc.DataType):
-    return plc_dtype.id() == plc.TypeId.STRING
 
+def sink_to_str(sink):
+    """
+    Takes a sink (e.g. StringIO/BytesIO, filepath, etc.)
+    and reads in the contents into a string (str not bytes)
+    for comparison
+    """
+    if isinstance(sink, (str, os.PathLike)):
+        with open(sink, "r") as f:
+            str_result = f.read()
+    elif isinstance(sink, io.BytesIO):
+        sink.seek(0)
+        str_result = sink.read().decode()
+    else:
+        sink.seek(0)
+        str_result = sink.read()
+    return str_result
 
-def is_fixed_width(plc_dtype: plc.DataType):
-    return (
-        is_integer(plc_dtype)
-        or is_floating(plc_dtype)
-        or is_boolean(plc_dtype)
-    )
 
+NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()]
+STRING_PA_TYPES = [pa.string()]
+BOOL_PA_TYPES = [pa.bool_()]
+LIST_PA_TYPES = [
+    pa.list_(pa.int64()),
+    # Nested case
+    pa.list_(pa.list_(pa.int64())),
+]
 
 # We must explicitly specify this type via a field to ensure we don't include
 # nullability accidentally.
 DEFAULT_STRUCT_TESTING_TYPE = pa.struct(
     [pa.field("v", pa.int64(), nullable=False)]
 )
+NESTED_STRUCT_TESTING_TYPE = pa.struct(
+    [
+        pa.field("a", pa.int64(), nullable=False),
+        pa.field(
+            "b_struct",
+            pa.struct([pa.field("b", pa.float64(), nullable=False)]),
+            nullable=False,
+        ),
+    ]
+)
+
+DEFAULT_PA_STRUCT_TESTING_TYPES = [
+    DEFAULT_STRUCT_TESTING_TYPE,
+    NESTED_STRUCT_TESTING_TYPE,
+]
+
+DEFAULT_PA_TYPES = (
+    NUMERIC_PA_TYPES
+    + STRING_PA_TYPES
+    + BOOL_PA_TYPES
+    + LIST_PA_TYPES
+    + DEFAULT_PA_STRUCT_TESTING_TYPES
+)
+
+ALL_PA_TYPES = DEFAULT_PA_TYPES
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index b169bbdee5b..e4760ea7ac8 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -1,9 +1,12 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 # Tell ruff it's OK that some imports occur after the sys.path.insert
 # ruff: noqa: E402
+import io
 import os
+import pathlib
 import sys
 
+import numpy as np
 import pyarrow as pa
 import pytest
 
@@ -11,7 +14,7 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
-from utils import DEFAULT_STRUCT_TESTING_TYPE
+from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES
 
 
 # This fixture defines the standard set of types that all tests should default to
@@ -20,14 +23,7 @@
 # across modules. Otherwise it may be defined on a per-module basis.
 @pytest.fixture(
     scope="session",
-    params=[
-        pa.int64(),
-        pa.float64(),
-        pa.string(),
-        pa.bool_(),
-        pa.list_(pa.int64()),
-        DEFAULT_STRUCT_TESTING_TYPE,
-    ],
+    params=DEFAULT_PA_TYPES,
 )
 def pa_type(request):
     return request.param
@@ -35,16 +31,96 @@ def pa_type(request):
 
 @pytest.fixture(
     scope="session",
-    params=[
-        pa.int64(),
-        pa.float64(),
-        pa.uint64(),
-    ],
+    params=NUMERIC_PA_TYPES,
 )
 def numeric_pa_type(request):
     return request.param
 
 
+# TODO: Consider adding another fixture/adapting this
+# fixture to consider nullability
+@pytest.fixture(scope="session", params=[0, 100])
+def table_data(request):
+    """
+    Returns (TableWithMetadata, pa_table).
+
+    This is the default fixture you should be using for testing
+    pylibcudf I/O writers.
+
+    Contains one of each category (e.g. int, bool, list, struct)
+    of dtypes.
+    """
+    nrows = request.param
+
+    table_dict = {}
+    # Colnames in the format expected by
+    # plc.io.TableWithMetadata
+    colnames = []
+
+    np.random.seed(42)
+
+    for typ in ALL_PA_TYPES:
+        rand_vals = np.random.randint(0, nrows, nrows)
+        child_colnames = []
+
+        def _generate_nested_data(typ):
+            child_colnames = []
+
+            # recurse to get vals for children
+            rand_arrs = []
+            for i in range(typ.num_fields):
+                rand_arr, grandchild_colnames = _generate_nested_data(
+                    typ.field(i).type
+                )
+                rand_arrs.append(rand_arr)
+                child_colnames.append((typ.field(i).name, grandchild_colnames))
+
+            if isinstance(typ, pa.StructType):
+                pa_array = pa.StructArray.from_arrays(
+                    [rand_arr for rand_arr in rand_arrs],
+                    names=[typ.field(i).name for i in range(typ.num_fields)],
+                )
+            elif isinstance(typ, pa.ListType):
+                pa_array = pa.array(
+                    [list(row_vals) for row_vals in zip(rand_arrs[0])],
+                    type=typ,
+                )
+                child_colnames.append(("", grandchild_colnames))
+            else:
+                # typ is scalar type
+                pa_array = pa.array(rand_vals).cast(typ)
+            return pa_array, child_colnames
+
+        if isinstance(typ, (pa.ListType, pa.StructType)):
+            rand_arr, child_colnames = _generate_nested_data(typ)
+        else:
+            rand_arr = pa.array(rand_vals).cast(typ)
+
+        table_dict[f"col_{typ}"] = rand_arr
+        colnames.append((f"col_{typ}", child_colnames))
+
+    pa_table = pa.Table.from_pydict(table_dict)
+
+    return plc.io.TableWithMetadata(
+        plc.interop.from_arrow(pa_table), column_names=colnames
+    ), pa_table
+
+
+@pytest.fixture(
+    params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO],
+)
+def source_or_sink(request, tmp_path):
+    fp_or_buf = request.param
+    if isinstance(fp_or_buf, str):
+        return f"{tmp_path}/{fp_or_buf}"
+    elif isinstance(fp_or_buf, os.PathLike):
+        return tmp_path.joinpath(fp_or_buf)
+    elif issubclass(fp_or_buf, io.IOBase):
+        # Must construct io.StringIO/io.BytesIO inside
+        # fixture, or we'll end up re-using it
+        return fp_or_buf()
+
+
 @pytest.fixture(
     scope="session", params=[opt for opt in plc.types.Interpolation]
 )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py
similarity index 100%
rename from python/cudf/cudf/pylibcudf_tests/test_avro.py
rename to python/cudf/cudf/pylibcudf_tests/io/test_avro.py
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
new file mode 100644
index 00000000000..d6b8bfa6976
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import io
+
+import pyarrow as pa
+import pytest
+from utils import sink_to_str
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize("rows_per_chunk", [8, 100])
+@pytest.mark.parametrize("lines", [True, False])
+def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk):
+    plc_table_w_meta, pa_table = table_data
+    sink = source_or_sink
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_table_w_meta,
+        lines=lines,
+        rows_per_chunk=rows_per_chunk,
+    )
+
+    exp = pa_table.to_pandas()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+
+    pd_result = exp.to_json(orient="records", lines=lines)
+
+    assert str_result == pd_result
+
+
+@pytest.mark.parametrize("include_nulls", [True, False])
+@pytest.mark.parametrize("na_rep", ["null", "awef", ""])
+def test_write_json_nulls(na_rep, include_nulls):
+    names = ["a", "b"]
+    pa_tbl = pa.Table.from_arrays(
+        [pa.array([1.0, 2.0, None]), pa.array([True, None, False])],
+        names=names,
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
+        plc_tbl, column_names=[(name, []) for name in names]
+    )
+
+    sink = io.StringIO()
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_tbl_w_meta,
+        na_rep=na_rep,
+        include_nulls=include_nulls,
+    )
+
+    exp = pa_tbl.to_pandas()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+    pd_result = exp.to_json(orient="records")
+
+    if not include_nulls:
+        # No equivalent in pandas, so we just
+        # sanity check by making sure na_rep
+        # doesn't appear in the output
+
+        # don't quote null
+        for name in names:
+            assert f'{{"{name}":{na_rep}}}' not in str_result
+        return
+
+    # pandas doesn't suppport na_rep
+    # let's just manually do str.replace
+    pd_result = pd_result.replace("null", na_rep)
+
+    assert str_result == pd_result
+
+
+@pytest.mark.parametrize("true_value", ["True", "correct"])
+@pytest.mark.parametrize("false_value", ["False", "wrong"])
+def test_write_json_bool_opts(true_value, false_value):
+    names = ["a"]
+    pa_tbl = pa.Table.from_arrays([pa.array([True, None, False])], names=names)
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
+        plc_tbl, column_names=[(name, []) for name in names]
+    )
+
+    sink = io.StringIO()
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_tbl_w_meta,
+        include_nulls=True,
+        na_rep="null",
+        true_value=true_value,
+        false_value=false_value,
+    )
+
+    exp = pa_tbl.to_pandas()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+    pd_result = exp.to_json(orient="records")
+
+    # pandas doesn't suppport na_rep
+    # let's just manually do str.replace
+    if true_value != "true":
+        pd_result = pd_result.replace("true", true_value)
+    if false_value != "false":
+        pd_result = pd_result.replace("false", false_value)
+
+    assert str_result == pd_result
diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
similarity index 72%
rename from python/cudf/cudf/pylibcudf_tests/test_source_info.py
rename to python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
index 019321b7259..287dd8f21c8 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_source_info.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
@@ -9,6 +9,21 @@
 from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
 
 
+@pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
+def io_class(request):
+    return request.param
+
+
+def _skip_invalid_sinks(io_class, sink):
+    """
+    Skip invalid sinks for SinkInfo
+    """
+    if io_class is plc.io.SinkInfo and isinstance(
+        sink, (bytes, NativeFileDatasource)
+    ):
+        pytest.skip(f"{sink} is not a valid input for SinkInfo")
+
+
 @pytest.mark.parametrize(
     "source",
     [
@@ -18,16 +33,15 @@
         NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
     ],
 )
-def test_source_info_ctor(source, tmp_path):
+def test_source_info_ctor(io_class, source, tmp_path):
     if isinstance(source, str):
         file = tmp_path / source
         file.write_bytes("hello world".encode("utf-8"))
         source = str(file)
 
-    plc.io.SourceInfo([source])
+    _skip_invalid_sinks(io_class, source)
 
-    # TODO: test contents of source_info buffer is correct
-    # once buffers are exposed on python side
+    io_class([source])
 
 
 @pytest.mark.parametrize(
@@ -42,7 +56,7 @@ def test_source_info_ctor(source, tmp_path):
         ],
     ],
 )
-def test_source_info_ctor_multiple(sources, tmp_path):
+def test_source_info_ctor_multiple(io_class, sources, tmp_path):
     for i in range(len(sources)):
         source = sources[i]
         if isinstance(source, str):
@@ -50,10 +64,9 @@ def test_source_info_ctor_multiple(sources, tmp_path):
             file.write_bytes("hello world".encode("utf-8"))
             sources[i] = str(file)
 
-    plc.io.SourceInfo(sources)
+        _skip_invalid_sinks(io_class, source)
 
-    # TODO: test contents of source_info buffer is correct
-    # once buffers are exposed on python side
+    io_class(sources)
 
 
 @pytest.mark.parametrize(
@@ -73,7 +86,7 @@ def test_source_info_ctor_multiple(sources, tmp_path):
         ],
     ],
 )
-def test_source_info_ctor_mixing_invalid(sources, tmp_path):
+def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):
     # Unlike the previous test
     # don't create files so that they are missing
     for i in range(len(sources)):
@@ -82,8 +95,9 @@ def test_source_info_ctor_mixing_invalid(sources, tmp_path):
             file = tmp_path / source
             file.write_bytes("hello world".encode("utf-8"))
             sources[i] = str(file)
+        _skip_invalid_sinks(io_class, source)
     with pytest.raises(ValueError):
-        plc.io.SourceInfo(sources)
+        io_class(sources)
 
 
 def test_source_info_invalid():
diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py
index da3ca3a6d1e..f27fe4e942e 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_copying.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py
@@ -5,19 +5,21 @@
 import pytest
 from utils import (
     DEFAULT_STRUCT_TESTING_TYPE,
+    NESTED_STRUCT_TESTING_TYPE,
     assert_column_eq,
     assert_table_eq,
     cudf_raises,
-    is_fixed_width,
-    is_floating,
-    is_integer,
+    is_nested_list,
+    is_nested_struct,
     is_string,
-    metadata_from_arrow_array,
+    metadata_from_arrow_type,
 )
 
 from cudf._lib import pylibcudf as plc
 
 
+# TODO: consider moving this to conftest and "pairing"
+# it with pa_type, so that they don't get out of sync
 # TODO: Test nullable data
 @pytest.fixture(scope="module")
 def input_column(pa_type):
@@ -28,10 +30,27 @@ def input_column(pa_type):
     elif pa.types.is_boolean(pa_type):
         pa_array = pa.array([True, True, False], type=pa_type)
     elif pa.types.is_list(pa_type):
-        # TODO: Add heterogenous sizes
-        pa_array = pa.array([[1], [2], [3]], type=pa_type)
+        if pa_type.value_type == pa.int64():
+            pa_array = pa.array([[1], [2, 3], [3]], type=pa_type)
+        elif (
+            isinstance(pa_type.value_type, pa.ListType)
+            and pa_type.value_type.value_type == pa.int64()
+        ):
+            pa_array = pa.array([[[1]], [[2, 3]], [[3]]], type=pa_type)
+        else:
+            raise ValueError("Unsupported type " + pa_type.value_type)
     elif pa.types.is_struct(pa_type):
-        pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type)
+        if not is_nested_struct(pa_type):
+            pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type)
+        else:
+            pa_array = pa.array(
+                [
+                    {"a": 1, "b_struct": {"b": 1.0}},
+                    {"a": 2, "b_struct": {"b": 2.0}},
+                    {"a": 3, "b_struct": {"b": 3.0}},
+                ],
+                type=pa_type,
+            )
     else:
         raise ValueError("Unsupported type")
     return pa_array, plc.interop.from_arrow(pa_array)
@@ -55,13 +74,37 @@ def target_column(pa_type):
             [False, True, True, False, True, False], type=pa_type
         )
     elif pa.types.is_list(pa_type):
-        # TODO: Add heterogenous sizes
-        pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type)
+        if pa_type.value_type == pa.int64():
+            pa_array = pa.array(
+                [[4], [5, 6], [7], [8], [9], [10]], type=pa_type
+            )
+        elif (
+            isinstance(pa_type.value_type, pa.ListType)
+            and pa_type.value_type.value_type == pa.int64()
+        ):
+            pa_array = pa.array(
+                [[[4]], [[5, 6]], [[7]], [[8]], [[9]], [[10]]], type=pa_type
+            )
+        else:
+            raise ValueError("Unsupported type")
     elif pa.types.is_struct(pa_type):
-        pa_array = pa.array(
-            [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}],
-            type=pa_type,
-        )
+        if not is_nested_struct(pa_type):
+            pa_array = pa.array(
+                [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}],
+                type=pa_type,
+            )
+        else:
+            pa_array = pa.array(
+                [
+                    {"a": 4, "b_struct": {"b": 4.0}},
+                    {"a": 5, "b_struct": {"b": 5.0}},
+                    {"a": 6, "b_struct": {"b": 6.0}},
+                    {"a": 7, "b_struct": {"b": 7.0}},
+                    {"a": 8, "b_struct": {"b": 8.0}},
+                    {"a": 9, "b_struct": {"b": 9.0}},
+                ],
+                type=pa_type,
+            )
     else:
         raise ValueError("Unsupported type")
     return pa_array, plc.interop.from_arrow(pa_array)
@@ -96,10 +139,22 @@ def source_scalar(pa_type):
     elif pa.types.is_boolean(pa_type):
         pa_scalar = pa.scalar(False, type=pa_type)
     elif pa.types.is_list(pa_type):
-        # TODO: Longer list?
-        pa_scalar = pa.scalar([1], type=pa_type)
+        if pa_type.value_type == pa.int64():
+            pa_scalar = pa.scalar([1, 2, 3, 4], type=pa_type)
+        elif (
+            isinstance(pa_type.value_type, pa.ListType)
+            and pa_type.value_type.value_type == pa.int64()
+        ):
+            pa_scalar = pa.scalar([[1, 2, 3, 4]], type=pa_type)
+        else:
+            raise ValueError("Unsupported type")
     elif pa.types.is_struct(pa_type):
-        pa_scalar = pa.scalar({"v": 1}, type=pa_type)
+        if not is_nested_struct(pa_type):
+            pa_scalar = pa.scalar({"v": 1}, type=pa_type)
+        else:
+            pa_scalar = pa.scalar(
+                {"a": 1, "b_struct": {"b": 1.0}}, type=pa_type
+            )
     else:
         raise ValueError("Unsupported type")
     return pa_scalar, plc.interop.from_arrow(pa_scalar)
@@ -196,27 +251,54 @@ def test_scatter_table(
             )
 
         if pa.types.is_list(dtype := pa_target_table[0].type):
-            expected = pa.table(
-                [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3
-            )
+            if is_nested_list(dtype):
+                expected = pa.table(
+                    [pa.array([[[4]], [[1]], [[2, 3]], [[3]], [[9]], [[10]]])]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [pa.array([[4], [1], [2, 3], [3], [9], [10]])] * 3,
+                    [""] * 3,
+                )
         elif pa.types.is_struct(dtype):
-            expected = pa.table(
-                [
-                    pa.array(
-                        [
-                            {"v": 4},
-                            {"v": 1},
-                            {"v": 2},
-                            {"v": 3},
-                            {"v": 8},
-                            {"v": 9},
-                        ],
-                        type=DEFAULT_STRUCT_TESTING_TYPE,
-                    )
-                ]
-                * 3,
-                [""] * 3,
-            )
+            if is_nested_struct(dtype):
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"a": 4, "b_struct": {"b": 4.0}},
+                                {"a": 1, "b_struct": {"b": 1.0}},
+                                {"a": 2, "b_struct": {"b": 2.0}},
+                                {"a": 3, "b_struct": {"b": 3.0}},
+                                {"a": 8, "b_struct": {"b": 8.0}},
+                                {"a": 9, "b_struct": {"b": 9.0}},
+                            ],
+                            type=NESTED_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"v": 4},
+                                {"v": 1},
+                                {"v": 2},
+                                {"v": 3},
+                                {"v": 8},
+                                {"v": 9},
+                            ],
+                            type=DEFAULT_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
     else:
         expected = _pyarrow_boolean_mask_scatter_table(
             pa_source_table,
@@ -274,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table):
     _, plc_index_column = index_column
     _, plc_target_table = target_table
     with cudf_raises(TypeError):
-        if is_integer(
+        if plc.traits.is_integral_not_bool(
             dtype := plc_target_table.columns()[0].type()
-        ) or is_floating(dtype):
+        ) or plc.traits.is_floating_point(dtype):
             pa_array = pa.array([True] * plc_source_table.num_rows())
         else:
             pa_array = pa.array([1] * plc_source_table.num_rows())
@@ -343,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table):
     _, plc_index_column = index_column
     _, plc_target_table = target_table
     with cudf_raises(TypeError):
-        if is_integer(
+        if plc.traits.is_integral_not_bool(
             dtype := plc_target_table.columns()[0].type()
-        ) or is_floating(dtype):
+        ) or plc.traits.is_floating_point(dtype):
             plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))]
         else:
             plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))]
@@ -373,7 +455,7 @@ def test_empty_like_table(source_table):
 @pytest.mark.parametrize("size", [None, 10])
 def test_allocate_like(input_column, size):
     _, plc_input_column = input_column
-    if is_fixed_width(plc_input_column.type()):
+    if plc.traits.is_fixed_width(plc_input_column.type()):
         result = plc.copying.allocate_like(
             plc_input_column,
             plc.copying.MaskAllocationPolicy.RETAIN,
@@ -399,7 +481,7 @@ def test_copy_range_in_place(
 
     pa_target_column, _ = target_column
 
-    if not is_fixed_width(mutable_target_column.type()):
+    if not plc.traits.is_fixed_width(mutable_target_column.type()):
         with pytest.raises(TypeError):
             plc.copying.copy_range_in_place(
                 plc_input_column,
@@ -431,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds(
 ):
     _, plc_input_column = input_column
 
-    if is_fixed_width(mutable_target_column.type()):
+    if plc.traits.is_fixed_width(mutable_target_column.type()):
         with cudf_raises(IndexError):
             plc.copying.copy_range_in_place(
                 plc_input_column,
@@ -443,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds(
 
 
 def test_copy_range_in_place_different_types(mutable_target_column):
-    if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := mutable_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
         plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
@@ -463,7 +547,7 @@ def test_copy_range_in_place_null_mismatch(
 ):
     pa_input_column, _ = input_column
 
-    if is_fixed_width(mutable_target_column.type()):
+    if plc.traits.is_fixed_width(mutable_target_column.type()):
         pa_input_column = pc.if_else(
             _pyarrow_index_to_mask([0], len(pa_input_column)),
             pa_input_column,
@@ -483,7 +567,9 @@ def test_copy_range_in_place_null_mismatch(
 def test_copy_range(input_column, target_column):
     pa_input_column, plc_input_column = input_column
     pa_target_column, plc_target_column = target_column
-    if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype):
+    if plc.traits.is_fixed_width(
+        dtype := plc_target_column.type()
+    ) or is_string(dtype):
         result = plc.copying.copy_range(
             plc_input_column,
             plc_target_column,
@@ -525,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column):
 
 def test_copy_range_different_types(target_column):
     _, plc_target_column = target_column
-    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := plc_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
         plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
@@ -544,7 +632,9 @@ def test_shift(target_column, source_scalar):
     pa_source_scalar, plc_source_scalar = source_scalar
     pa_target_column, plc_target_column = target_column
     shift = 2
-    if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype):
+    if plc.traits.is_fixed_width(
+        dtype := plc_target_column.type()
+    ) or is_string(dtype):
         result = plc.copying.shift(plc_target_column, shift, plc_source_scalar)
         expected = pa.concat_arrays(
             [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]]
@@ -557,7 +647,9 @@ def test_shift(target_column, source_scalar):
 
 def test_shift_type_mismatch(target_column):
     _, plc_target_column = target_column
-    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := plc_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         fill_value = plc.interop.from_arrow(pa.scalar("a"))
     else:
         fill_value = plc.interop.from_arrow(pa.scalar(1))
@@ -627,6 +719,7 @@ def test_split_column_out_of_bounds(target_column):
 
 def test_split_table(target_table):
     pa_target_table, plc_target_table = target_table
+
     upper_bounds = [1, 3, 5]
     lower_bounds = [0] + upper_bounds[:-1]
     result = plc.copying.split(plc_target_table, upper_bounds)
@@ -661,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar):
 def test_copy_if_else_wrong_type(target_column, mask):
     _, plc_target_column = target_column
     _, plc_mask = mask
-    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := plc_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         plc_input_column = plc.interop.from_arrow(
             pa.array(["a"] * plc_target_column.size())
         )
@@ -718,6 +813,7 @@ def test_copy_if_else_column_scalar(
     pa_target_column, plc_target_column = target_column
     pa_source_scalar, plc_source_scalar = source_scalar
     pa_mask, plc_mask = mask
+
     args = (
         (plc_target_column, plc_source_scalar)
         if array_left
@@ -766,27 +862,58 @@ def test_boolean_mask_scatter_from_table(
             )
 
         if pa.types.is_list(dtype := pa_target_table[0].type):
-            expected = pa.table(
-                [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3
-            )
+            if is_nested_list(dtype):
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [[[1]], [[5, 6]], [[2, 3]], [[8]], [[3]], [[10]]]
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [pa.array([[1], [5, 6], [2, 3], [8], [3], [10]])] * 3,
+                    [""] * 3,
+                )
         elif pa.types.is_struct(dtype):
-            expected = pa.table(
-                [
-                    pa.array(
-                        [
-                            {"v": 1},
-                            {"v": 5},
-                            {"v": 2},
-                            {"v": 7},
-                            {"v": 3},
-                            {"v": 9},
-                        ],
-                        type=DEFAULT_STRUCT_TESTING_TYPE,
-                    )
-                ]
-                * 3,
-                [""] * 3,
-            )
+            if is_nested_struct(dtype):
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"a": 1, "b_struct": {"b": 1.0}},
+                                {"a": 5, "b_struct": {"b": 5.0}},
+                                {"a": 2, "b_struct": {"b": 2.0}},
+                                {"a": 7, "b_struct": {"b": 7.0}},
+                                {"a": 3, "b_struct": {"b": 3.0}},
+                                {"a": 9, "b_struct": {"b": 9.0}},
+                            ],
+                            type=NESTED_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"v": 1},
+                                {"v": 5},
+                                {"v": 2},
+                                {"v": 7},
+                                {"v": 3},
+                                {"v": 9},
+                            ],
+                            type=DEFAULT_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
     else:
         expected = _pyarrow_boolean_mask_scatter_table(
             pa_source_table, pa_mask, pa_target_table
@@ -833,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table):
 def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask):
     _, plc_target_table = target_table
     _, plc_mask = mask
-    if is_integer(
+    if plc.traits.is_integral_not_bool(
         dtype := plc_target_table.columns()[0].type()
-    ) or is_floating(dtype):
+    ) or plc.traits.is_floating_point(dtype):
         input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
         input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
@@ -887,7 +1014,7 @@ def test_get_element(input_column):
 
     assert (
         plc.interop.to_arrow(
-            result, metadata_from_arrow_array(pa_input_column)
+            result, metadata_from_arrow_type(pa_input_column.type)
         ).as_py()
         == pa_input_column[index].as_py()
     )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index c781126e388..58a1dcf8d56 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -134,3 +134,15 @@ def test_index_of_list_column(test_data, column):
     expect = pa.array(column[1], type=pa.int32())
 
     assert_column_eq(expect, res)
+
+
+def test_reverse(test_data):
+    list_column = test_data[0][0]
+    arr = pa.array(list_column)
+    plc_column = plc.interop.from_arrow(arr)
+
+    res = plc.lists.reverse(plc_column)
+
+    expect = pa.array([lst[::-1] for lst in list_column])
+
+    assert_column_eq(expect, res)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py
new file mode 100644
index 00000000000..6c22cb02f21
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_is_relationally_comparable():
+    assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_relationally_comparable(
+        plc.DataType(plc.TypeId.LIST)
+    )
+
+
+def test_is_equality_comparable():
+    assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST))
+
+
+def test_is_numeric():
+    assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64))
+    assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST))
+
+
+def test_is_index_type():
+    assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8))
+
+
+def test_is_unsigned():
+    assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8))
+    assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8))
+
+
+def test_is_integral():
+    assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8))
+    assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32))
+
+
+def test_is_integral_not_bool():
+    assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8))
+
+
+def test_is_floating_point():
+    assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64))
+    assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_boolean():
+    assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8))
+    assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_timestamp():
+    assert plc.traits.is_timestamp(
+        plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+    )
+    assert not plc.traits.is_timestamp(
+        plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+    )
+
+
+def test_is_fixed_point():
+    assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128))
+    assert not plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32))
+
+
+def test_is_duration():
+    assert plc.traits.is_duration(
+        plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+    )
+    assert not plc.traits.is_duration(
+        plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+    )
+
+
+def test_is_chrono():
+    assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS))
+    assert plc.traits.is_chrono(
+        plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+    )
+    assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_dictionary():
+    assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32))
+    assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_fixed_width():
+    assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING))
+
+
+def test_is_compound():
+    assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT))
+    assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_nested():
+    assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT))
+    assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING))
+
+
+def test_is_bit_castable():
+    assert plc.traits.is_bit_castable(
+        plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8)
+    )
+    assert not plc.traits.is_bit_castable(
+        plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16)
+    )
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index ea919c786b9..c288155112c 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -515,17 +515,6 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
     np.testing.assert_array_equal(expect_mask, got_mask)
 
 
-def test_concatenate_large_column_strings():
-    num_strings = 1_000_000
-    string_scale_f = 100
-
-    s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings)
-    s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings)
-
-    with pytest.raises(OverflowError):
-        cudf.concat([s_1, s_2])
-
-
 @pytest.mark.parametrize(
     "alias,expect_dtype",
     [
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 1b395c09ba8..5eeea87d8e0 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -188,3 +188,9 @@ def test_from_pandas_intervaldtype():
     result = cudf.from_pandas(dtype)
     expected = cudf.IntervalDtype("int64", closed="left")
     assert_eq(result, expected)
+
+
+def test_intervaldtype_eq_string_with_attributes():
+    dtype = cudf.IntervalDtype("int64", closed="left")
+    assert dtype == "interval"
+    assert dtype == "interval[int64, left]"
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index f76143cb381..ec9d7995b05 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -12,6 +12,7 @@
 from cudf import NA
 from cudf._lib.copying import get_element
 from cudf.api.types import is_scalar
+from cudf.core.column.column import column_empty
 from cudf.testing import assert_eq
 from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
 
@@ -926,3 +927,29 @@ def test_list_iterate_error():
 def test_list_struct_list_memory_usage():
     df = cudf.DataFrame({"a": [[{"b": [1]}]]})
     assert df.memory_usage().sum() == 16
+
+
+def test_empty_nested_list_uninitialized_offsets_memory_usage():
+    col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64")))
+    nested_col = col.children[1]
+    empty_inner = type(nested_col)(
+        size=nested_col.size,
+        dtype=nested_col.dtype,
+        mask=nested_col.mask,
+        offset=nested_col.offset,
+        null_count=nested_col.null_count,
+        children=(
+            column_empty(0, nested_col.children[0].dtype),
+            nested_col.children[1],
+        ),
+    )
+    col_empty_offset = type(col)(
+        size=col.size,
+        dtype=col.dtype,
+        mask=col.mask,
+        offset=col.offset,
+        null_count=col.null_count,
+        children=(column_empty(0, col.children[0].dtype), empty_inner),
+    )
+    ser = cudf.Series._from_data({None: col_empty_offset})
+    assert ser.memory_usage() == 8
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 467d0c46ae7..8ed78d804bf 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2757,8 +2757,6 @@ def test_series_from_large_string(pa_type):
 
     assert_eq(expected, got)
 
-    assert pa_string_array.equals(got.to_arrow())
-
 
 @pytest.mark.parametrize(
     "scalar",
@@ -2873,3 +2871,42 @@ def test_nunique_all_null(dropna):
     result = pd_ser.nunique(dropna=dropna)
     expected = cudf_ser.nunique(dropna=dropna)
     assert result == expected
+
+
+@pytest.mark.parametrize(
+    "type1",
+    [
+        "category",
+        "interval[int64, right]",
+        "int64",
+        "float64",
+        "str",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+    ],
+)
+@pytest.mark.parametrize(
+    "type2",
+    [
+        "category",
+        "interval[int64, right]",
+        "int64",
+        "float64",
+        "str",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+    ],
+)
+@pytest.mark.parametrize(
+    "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"]
+)
+@pytest.mark.parametrize("copy", [True, False])
+def test_empty_astype_always_castable(type1, type2, as_dtype, copy):
+    ser = cudf.Series([], dtype=as_dtype(type1))
+    result = ser.astype(as_dtype(type2), copy=copy)
+    expected = cudf.Series([], dtype=as_dtype(type2))
+    assert_eq(result, expected)
+    if not copy and cudf.dtype(type1) == cudf.dtype(type2):
+        assert ser._column is result._column
+    else:
+        assert ser._column is not result._column
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 20b731624df..dcb33b1fc1a 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -121,7 +121,7 @@ skip = [
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
 requires = [
-    "cmake>=3.26.4",
+    "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 11e18cd4f32..badfdf06d15 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -101,7 +101,7 @@ regex = "(?P<value>.*)"
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
 requires = [
-    "cmake>=3.26.4",
+    "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 979087d5273..764cdd3b3ca 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -34,7 +34,12 @@ def _callback(
         return ir.evaluate(cache={}).to_polars()
 
 
-def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None:
+def execute_with_cudf(
+    nt: NodeTraverser,
+    *,
+    raise_on_fail: bool = False,
+    exception: type[Exception] | tuple[type[Exception], ...] = Exception,
+) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
 
@@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None
         Should conversion raise an exception rather than continuing
         without setting a callback.
 
+    exception
+        Optional exception, or tuple of exceptions, to catch during
+        translation. Defaults to ``Exception``.
+
     The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
     try:
         with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
             nt.set_udf(partial(_callback, translate_ir(nt)))
-    except NotImplementedError:
+    except exception:
         if raise_on_fail:
             raise
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index ec8d00c3123..d86656578d7 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import itertools
 from functools import cached_property
 from typing import TYPE_CHECKING, cast
 
@@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
         -----
         If column names overlap, newer names replace older ones.
         """
-        return type(self)([*self.columns, *columns])
+        columns = list(
+            {c.name: c for c in itertools.chain(self.columns, columns)}.values()
+        )
+        return type(self)(columns)
 
     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 17d7d15e4e5..93cb9db7cbd 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -27,11 +27,12 @@
 import cudf._lib.pylibcudf as plc
 
 from cudf_polars.containers import Column, NamedColumn
-from cudf_polars.utils import sorting
+from cudf_polars.utils import dtypes, sorting
 
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence
 
+    import polars.polars as plrs
     import polars.type_aliases as pl_types
 
     from cudf_polars.containers import DataFrame
@@ -43,6 +44,7 @@
     "Col",
     "BooleanFunction",
     "StringFunction",
+    "TemporalFunction",
     "Sort",
     "SortBy",
     "Gather",
@@ -369,6 +371,29 @@ def do_evaluate(
         return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
 
 
+class LiteralColumn(Expr):
+    __slots__ = ("value",)
+    _non_child = ("dtype", "value")
+    value: pa.Array[Any, Any]
+    children: tuple[()]
+
+    def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None:
+        super().__init__(dtype)
+        data = value.to_arrow()
+        self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        # datatype of pyarrow array is correct by construction.
+        return Column(plc.interop.from_arrow(self.value))
+
+
 class Col(Expr):
     __slots__ = ("name",)
     _non_child = ("dtype", "name")
@@ -679,6 +704,7 @@ def _validate_input(self):
             pl_expr.StringFunction.EndsWith,
             pl_expr.StringFunction.StartsWith,
             pl_expr.StringFunction.Contains,
+            pl_expr.StringFunction.Slice,
         ):
             raise NotImplementedError(f"String function {self.name}")
         if self.name == pl_expr.StringFunction.Contains:
@@ -692,6 +718,11 @@ def _validate_input(self):
                     raise NotImplementedError(
                         "Regex contains only supports a scalar pattern"
                     )
+        elif self.name == pl_expr.StringFunction.Slice:
+            if not all(isinstance(child, Literal) for child in self.children[1:]):
+                raise NotImplementedError(
+                    "Slice only supports literal start and stop values"
+                )
 
     def do_evaluate(
         self,
@@ -720,6 +751,36 @@ def do_evaluate(
                 flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
             )
             return Column(plc.strings.contains.contains_re(column.obj, prog))
+        elif self.name == pl_expr.StringFunction.Slice:
+            child, expr_offset, expr_length = self.children
+            assert isinstance(expr_offset, Literal)
+            assert isinstance(expr_length, Literal)
+
+            column = child.evaluate(df, context=context, mapping=mapping)
+            # libcudf slices via [start,stop).
+            # polars slices with offset + length where start == offset
+            # stop = start + length. Negative values for start look backward
+            # from the last element of the string. If the end index would be
+            # below zero, an empty string is returned.
+            # Do this maths on the host
+            start = expr_offset.value.as_py()
+            length = expr_length.value.as_py()
+
+            if length == 0:
+                stop = start
+            else:
+                # No length indicates a scan to the end
+                # The libcudf equivalent is a null stop
+                stop = start + length if length else None
+                if length and start < 0 and length >= -start:
+                    stop = None
+            return Column(
+                plc.strings.slice.slice_strings(
+                    column.obj,
+                    plc.interop.from_arrow(pa.scalar(start, type=pa.int32())),
+                    plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())),
+                )
+            )
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
@@ -755,6 +816,129 @@ def do_evaluate(
         )  # pragma: no cover; handled by init raising
 
 
+class TemporalFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.TemporalFunction,
+        options: tuple[Any, ...],
+        *children: Expr,
+    ) -> None:
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        if self.name != pl_expr.TemporalFunction.Year:
+            raise NotImplementedError(f"String function {self.name}")
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        if self.name == pl_expr.TemporalFunction.Year:
+            (column,) = columns
+            return Column(plc.datetime.extract_year(column.obj))
+        raise NotImplementedError(
+            f"TemporalFunction {self.name}"
+        )  # pragma: no cover; init trips first
+
+
+class UnaryFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr
+    ) -> None:
+        super().__init__(dtype)
+        self.name = name
+        self.options = options
+        self.children = children
+        if self.name not in ("round", "unique"):
+            raise NotImplementedError(f"Unary function {name=}")
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        if self.name == "round":
+            (decimal_places,) = self.options
+            (values,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            return Column(
+                plc.round.round(
+                    values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP
+                )
+            ).sorted_like(values)
+        elif self.name == "unique":
+            (maintain_order,) = self.options
+            (values,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            # Only one column, so keep_any is the same as keep_first
+            # for stable distinct
+            keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY
+            if values.is_sorted:
+                maintain_order = True
+                result = plc.stream_compaction.unique(
+                    plc.Table([values.obj]),
+                    [0],
+                    keep,
+                    plc.types.NullEquality.EQUAL,
+                )
+            else:
+                distinct = (
+                    plc.stream_compaction.stable_distinct
+                    if maintain_order
+                    else plc.stream_compaction.distinct
+                )
+                result = distinct(
+                    plc.Table([values.obj]),
+                    [0],
+                    keep,
+                    plc.types.NullEquality.EQUAL,
+                    plc.types.NanEquality.ALL_EQUAL,
+                )
+            (column,) = result.columns()
+            if maintain_order:
+                return Column(column).sorted_like(values)
+            return Column(column)
+        raise NotImplementedError(
+            f"Unimplemented unary function {self.name=}"
+        )  # pragma: no cover; init trips first
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        if depth == 1:
+            # inside aggregation, need to pre-evaluate, groupby
+            # construction has checked that we don't have nested aggs,
+            # so stop the recursion and return ourselves for pre-eval
+            return AggInfo([(self, plc.aggregation.collect_list(), self)])
+        else:
+            (child,) = self.children
+            return child.collect_agg(depth=depth)
+
+
 class Sort(Expr):
     __slots__ = ("options", "children")
     _non_child = ("dtype", "options")
@@ -954,15 +1138,15 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class Agg(Expr):
     __slots__ = ("name", "options", "op", "request", "children")
     _non_child = ("dtype", "name", "options")
-    children: tuple[Expr]
+    children: tuple[Expr, ...]
 
     def __init__(
-        self, dtype: plc.DataType, name: str, options: Any, value: Expr
+        self, dtype: plc.DataType, name: str, options: Any, *children: Expr
     ) -> None:
         super().__init__(dtype)
         self.name = name
         self.options = options
-        self.children = (value,)
+        self.children = children
         if name not in Agg._SUPPORTED:
             raise NotImplementedError(
                 f"Unsupported aggregation {name=}"
@@ -1156,6 +1340,13 @@ def __init__(
         super().__init__(dtype)
         self.op = op
         self.children = (left, right)
+        if (
+            op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB)
+            and plc.traits.is_chrono(left.dtype)
+            and plc.traits.is_chrono(right.dtype)
+            and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id())
+        ):
+            raise NotImplementedError("Casting rules for timelike types")
 
     _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = {
         pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL,
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 3f5f3c74050..6b552642e88 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -15,6 +15,7 @@
 
 import dataclasses
 import itertools
+import json
 import types
 from functools import cache
 from typing import TYPE_CHECKING, Any, Callable, ClassVar
@@ -29,7 +30,7 @@
 
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import DataFrame, NamedColumn
-from cudf_polars.utils import sorting
+from cudf_polars.utils import dtypes, sorting
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
@@ -95,6 +96,8 @@ def broadcast(
     ``target_length`` is provided and not all columns are length-1
     (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
     """
+    if len(columns) == 0:
+        return []
     lengths: set[int] = {column.obj.size() for column in columns}
     if lengths == {1}:
         if target_length is None:
@@ -130,6 +133,11 @@ class IR:
     schema: Schema
     """Mapping from column names to their data types."""
 
+    def __post_init__(self):
+        """Validate preconditions."""
+        if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()):
+            raise NotImplementedError("Cannot make empty columns.")
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """
         Evaluate the node and return a dataframe.
@@ -175,8 +183,10 @@ def __post_init__(self):
 class Scan(IR):
     """Input from files."""
 
-    typ: Any
+    typ: str
     """What type of file are we reading? Parquet, CSV, etc..."""
+    options: tuple[Any, ...]
+    """Type specific options, as json-encoded strings."""
     paths: list[str]
     """List of paths to read from."""
     file_options: Any
@@ -206,17 +216,21 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         with_columns = options.with_columns
         row_index = options.row_index
         if self.typ == "csv":
+            opts, cloud_opts = map(json.loads, self.options)
             df = DataFrame.from_cudf(
                 cudf.concat(
                     [cudf.read_csv(p, usecols=with_columns) for p in self.paths]
                 )
             )
         elif self.typ == "parquet":
+            opts, cloud_opts = map(json.loads, self.options)
             cdf = cudf.read_parquet(self.paths, columns=with_columns)
             assert isinstance(cdf, cudf.DataFrame)
             df = DataFrame.from_cudf(cdf)
         else:
-            assert_never(self.typ)
+            raise NotImplementedError(
+                f"Unhandled scan type: {self.typ}"
+            )  # pragma: no cover; post init trips first
         if row_index is not None:
             name, offset = row_index
             dtype = self.schema[name]
@@ -292,15 +306,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         table = pdf.to_arrow()
         schema = table.schema
         for i, field in enumerate(schema):
-            # TODO: Nested types
-            if field.type == pa.large_string():
-                # TODO: goes away when libcudf supports large strings
-                schema = schema.set(i, pa.field(field.name, pa.string()))
-            elif isinstance(field.type, pa.LargeListType):
-                # TODO: goes away when libcudf supports large lists
-                schema = schema.set(
-                    i, pa.field(field.name, pa.list_(field.type.field(0)))
-                )
+            schema = schema.set(
+                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
+            )
+        # No-op if the schema is unchanged.
         table = table.cast(schema)
         df = DataFrame.from_table(
             plc.interop.from_arrow(table), list(self.schema.keys())
@@ -424,7 +433,7 @@ def check_agg(agg: expr.Expr) -> int:
         NotImplementedError
             For unsupported expression nodes.
         """
-        if isinstance(agg, (expr.BinOp, expr.Cast)):
+        if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)):
             return max(GroupBy.check_agg(child) for child in agg.children)
         elif isinstance(agg, expr.Agg):
             return 1 + max(GroupBy.check_agg(child) for child in agg.children)
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 953ff636cce..5a1e682abe7 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -12,6 +12,7 @@
 import pyarrow as pa
 from typing_extensions import assert_never
 
+import polars.polars as plrs
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 import cudf._lib.pylibcudf as plc
@@ -86,9 +87,11 @@ def _(
 def _(
     node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
+    typ, *options = node.scan_type
     return ir.Scan(
         schema,
-        node.scan_type,
+        typ,
+        tuple(options),
         node.paths,
         node.file_options,
         translate_named_expr(visitor, n=node.predicate)
@@ -358,8 +361,23 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             options,
             *(translate_expr(visitor, n=n) for n in node.input),
         )
-    else:
-        raise NotImplementedError(f"No handler for Expr function node with {name=}")
+    elif isinstance(name, pl_expr.TemporalFunction):
+        return expr.TemporalFunction(
+            dtype,
+            name,
+            options,
+            *(translate_expr(visitor, n=n) for n in node.input),
+        )
+    elif isinstance(name, str):
+        return expr.UnaryFunction(
+            dtype,
+            name,
+            options,
+            *(translate_expr(visitor, n=n) for n in node.input),
+        )
+    raise NotImplementedError(
+        f"No handler for Expr function node with {name=}"
+    )  # pragma: no cover; polars raises on the rust side for now
 
 
 @_translate_expr.register
@@ -383,6 +401,8 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr
 
 @_translate_expr.register
 def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+    if isinstance(node.value, plrs.PySeries):
+        return expr.LiteralColumn(dtype, node.value)
     value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
     return expr.Literal(dtype, value)
 
@@ -427,8 +447,11 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
         return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
-    else:
-        return expr.Cast(dtype, inner)
+    elif isinstance(inner, expr.Cast):
+        # Translation of Len/Count-agg put in a cast, remove double
+        # casts if we have one.
+        (inner,) = inner.children
+    return expr.Cast(dtype, inner)
 
 
 @_translate_expr.register
@@ -438,12 +461,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr
 
 @_translate_expr.register
 def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    return expr.Agg(
+    value = expr.Agg(
         dtype,
         node.name,
         node.options,
-        translate_expr(visitor, n=node.arguments),
+        *(translate_expr(visitor, n=n) for n in node.arguments),
     )
+    if value.name == "count" and value.dtype.id() != plc.TypeId.INT32:
+        return expr.Cast(value.dtype, value)
+    return value
 
 
 @_translate_expr.register
@@ -470,7 +496,10 @@ def _(
 
 @_translate_expr.register
 def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    return expr.Len(dtype)
+    value = expr.Len(dtype)
+    if dtype.id() != plc.TypeId.INT32:
+        return expr.Cast(dtype, value)
+    return value  # pragma: no cover; never reached since polars len has uint32 dtype
 
 
 def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr:
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 3d4a643e1fc..918cd024fa2 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -7,13 +7,79 @@
 
 from functools import cache
 
+import pyarrow as pa
 from typing_extensions import assert_never
 
 import polars as pl
 
 import cudf._lib.pylibcudf as plc
 
-__all__ = ["from_polars"]
+__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"]
+
+
+def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId):
+    """
+    Do two datetime typeids have matching resolution for a binop.
+
+    Parameters
+    ----------
+    lid
+       Left type id
+    rid
+       Right type id
+
+    Returns
+    -------
+    True if resolutions are compatible, False otherwise.
+
+    Notes
+    -----
+    Polars has different casting rules for combining
+    datetimes/durations than libcudf, and while we don't encode the
+    casting rules fully, just reject things we can't handle.
+
+    Precondition for correctness: both lid and rid are timelike.
+    """
+    if lid == rid:
+        return True
+    # Timestamps are smaller than durations in the libcudf enum.
+    lid, rid = sorted([lid, rid])
+    if lid == plc.TypeId.TIMESTAMP_MILLISECONDS:
+        return rid == plc.TypeId.DURATION_MILLISECONDS
+    elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS:
+        return rid == plc.TypeId.DURATION_MICROSECONDS
+    elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS:
+        return rid == plc.TypeId.DURATION_NANOSECONDS
+    return False
+
+
+def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType:
+    """
+    Sanitize an arrow datatype from polars.
+
+    Parameters
+    ----------
+    typ
+        Arrow type to sanitize
+
+    Returns
+    -------
+    Sanitized arrow type
+
+    Notes
+    -----
+    As well as arrow ``ListType``s, polars can produce
+    ``LargeListType``s and ``FixedSizeListType``s, these are not
+    currently handled by libcudf, so we attempt to cast them all into
+    normal ``ListType``s on the arrow side before consuming the arrow
+    data.
+    """
+    if isinstance(typ, pa.LargeListType):
+        return pa.list_(downcast_arrow_lists(typ.value_type))
+    # We don't have to worry about diving into struct types for now
+    # since those are always NotImplemented before we get here.
+    assert not isinstance(typ, pa.StructType)
+    return typ
 
 
 @cache
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index effa4861e0c..0b559f7a8e9 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.8.*,>=0.0.0a0",
-    "polars>=0.20.30",
+    "polars>=1.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -182,5 +182,3 @@ docstring-code-format = true
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
 dependencies-file = "../../dependencies.yaml"
-# Pure python
-disable-cuda = true
diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py
index 6ba2a1dce1e..218101bf87c 100644
--- a/python/cudf_polars/tests/expressions/test_datetime_basic.py
+++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import datetime
+from operator import methodcaller
+
 import pytest
 
 import polars as pl
@@ -32,3 +35,28 @@ def test_datetime_dataframe_scan(dtype):
 
     query = ldf.select(pl.col("b"), pl.col("a"))
     assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "field",
+    [
+        methodcaller("year"),
+        pytest.param(
+            methodcaller("day"),
+            marks=pytest.mark.xfail(reason="day extraction not implemented"),
+        ),
+    ],
+)
+def test_datetime_extract(field):
+    ldf = pl.LazyFrame(
+        {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]}
+    )
+    q = ldf.select(field(pl.col("dates").dt))
+
+    with pytest.raises(AssertionError):
+        # polars produces int32, libcudf produces int16 for the year extraction
+        # libcudf can lose data here.
+        # https://github.com/rapidsai/cudf/issues/16196
+        assert_gpu_result_equal(q)
+
+    assert_gpu_result_equal(q, check_dtypes=False)
diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py
new file mode 100644
index 00000000000..55e688428bd
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_literal.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
+from cudf_polars.utils import dtypes
+
+
+@pytest.fixture(
+    params=[
+        None,
+        pl.Int8(),
+        pl.Int16(),
+        pl.Int32(),
+        pl.Int64(),
+        pl.UInt8(),
+        pl.UInt16(),
+        pl.UInt32(),
+        pl.UInt64(),
+    ]
+)
+def integer(request):
+    return pl.lit(10, dtype=request.param)
+
+
+@pytest.fixture(params=[None, pl.Float32(), pl.Float64()])
+def float(request):
+    return pl.lit(1.0, dtype=request.param)
+
+
+def test_numeric_literal(integer, float):
+    df = pl.LazyFrame({})
+
+    q = df.select(integer=integer, float_=float, sum_=integer + float)
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.fixture(
+    params=[pl.Date(), pl.Datetime("ms"), pl.Datetime("us"), pl.Datetime("ns")]
+)
+def timestamp(request):
+    return pl.lit(10_000, dtype=request.param)
+
+
+@pytest.fixture(params=[pl.Duration("ms"), pl.Duration("us"), pl.Duration("ns")])
+def timedelta(request):
+    return pl.lit(9_000, dtype=request.param)
+
+
+def test_timelike_literal(timestamp, timedelta):
+    df = pl.LazyFrame({})
+
+    q = df.select(
+        time=timestamp,
+        delta=timedelta,
+        adjusted=timestamp + timedelta,
+        two_delta=timedelta + timedelta,
+    )
+    schema = q.collect_schema()
+    time_type = schema["time"]
+    delta_type = schema["delta"]
+    if dtypes.have_compatible_resolution(
+        dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id()
+    ):
+        assert_gpu_result_equal(q)
+    else:
+        assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_select_literal_series():
+    df = pl.LazyFrame({})
+
+    q = df.select(
+        a=pl.Series(["a", "b", "c"], dtype=pl.String()),
+        b=pl.Series([[1, 2], [3], None], dtype=pl.List(pl.UInt16())),
+        c=pl.Series([[[1]], [], [[1, 2, 3, 4]]], dtype=pl.List(pl.List(pl.Float32()))),
+    )
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("expr", [pl.lit(None), pl.lit(10, dtype=pl.Decimal())])
+def test_unsupported_literal_raises(expr):
+    df = pl.LazyFrame({})
+
+    q = df.select(expr)
+
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py
new file mode 100644
index 00000000000..3af3a0ce6d1
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_round.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import math
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(params=[pl.Float32, pl.Float64])
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture
+def df(dtype, with_nulls):
+    a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8]
+    if with_nulls:
+        a[2] = None
+        a[-1] = None
+    return pl.LazyFrame({"a": a}, schema={"a": dtype})
+
+
+@pytest.mark.parametrize("decimals", [0, 2, 4])
+def test_round(df, decimals):
+    q = df.select(pl.col("a").round(decimals=decimals))
+
+    assert_gpu_result_equal(q, check_exact=False)
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
index 9729e765948..8cf65dd51ac 100644
--- a/python/cudf_polars/tests/expressions/test_stringfunction.py
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -37,6 +37,30 @@ def ldf(with_nulls):
     return pl.LazyFrame({"a": a, "b": range(len(a))})
 
 
+slice_cases = [
+    (1, 3),
+    (0, 3),
+    (0, 0),
+    (-3, 1),
+    (-100, 5),
+    (1, 1),
+    (100, 100),
+    (-3, 4),
+    (-3, 3),
+]
+
+
+@pytest.fixture(params=slice_cases)
+def slice_column_data(ldf, request):
+    start, length = request.param
+    if length:
+        return ldf.with_columns(
+            pl.lit(start).alias("start"), pl.lit(length).alias("length")
+        )
+    else:
+        return ldf.with_columns(pl.lit(start).alias("start"))
+
+
 def test_supported_stringfunction_expression(ldf):
     query = ldf.select(
         pl.col("a").str.starts_with("Z"),
@@ -104,3 +128,25 @@ def test_contains_invalid(ldf):
         query.collect()
     with pytest.raises(pl.exceptions.ComputeError):
         query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))
+
+
+@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100])
+def test_slice_scalars_offset(ldf, offset):
+    query = ldf.select(pl.col("a").str.slice(offset))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize("offset,length", slice_cases)
+def test_slice_scalars_length_and_offset(ldf, offset, length):
+    query = ldf.select(pl.col("a").str.slice(offset, length))
+    assert_gpu_result_equal(query)
+
+
+def test_slice_column(slice_column_data):
+    if "length" in slice_column_data.collect_schema():
+        query = slice_column_data.select(
+            pl.col("a").str.slice(pl.col("start"), pl.col("length"))
+        )
+    else:
+        query = slice_column_data.select(pl.col("a").str.slice(pl.col("start")))
+    assert_ir_translation_raises(query, NotImplementedError)
diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py
new file mode 100644
index 00000000000..9b009a422c2
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_unique.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"])
+@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"])
+def test_unique(maintain_order, pre_sorted):
+    ldf = pl.DataFrame(
+        {
+            "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3],
+        }
+    ).lazy()
+    if pre_sorted:
+        ldf = ldf.sort("b")
+
+    query = ldf.select(pl.col("b").unique(maintain_order=maintain_order))
+    assert_gpu_result_equal(query, check_row_order=maintain_order)
diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py
index 1ffe06ac562..b5c0fb7be9f 100644
--- a/python/cudf_polars/tests/test_dataframescan.py
+++ b/python/cudf_polars/tests/test_dataframescan.py
@@ -41,3 +41,22 @@ def test_scan_drop_nulls(subset, predicate_pushdown):
     assert_gpu_result_equal(
         q, collect_kwargs={"predicate_pushdown": predicate_pushdown}
     )
+
+
+def test_can_convert_lists():
+    df = pl.LazyFrame(
+        {
+            "a": pl.Series([[1, 2], [3]], dtype=pl.List(pl.Int8())),
+            "b": pl.Series([[1], [2]], dtype=pl.List(pl.UInt16())),
+            "c": pl.Series(
+                [
+                    [["1", "2", "3"], ["4", "567"]],
+                    [["8", "9"], []],
+                ],
+                dtype=pl.List(pl.List(pl.String())),
+            ),
+            "d": pl.Series([[[1, 2]], []], dtype=pl.List(pl.List(pl.UInt16()))),
+        }
+    )
+
+    assert_gpu_result_equal(df)
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index aefad59eb91..b84e2c16b43 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -47,6 +47,8 @@ def keys(request):
         [pl.col("float").max() - pl.col("int").min()],
         [pl.col("float").mean(), pl.col("int").std()],
         [(pl.col("float") - pl.lit(2)).max()],
+        [pl.col("float").sum().round(decimals=1)],
+        [pl.col("float").round(decimals=1).sum()],
     ],
     ids=lambda aggs: "-".join(map(str, aggs)),
 )
@@ -83,10 +85,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
 def test_groupby_len(df, keys):
     q = df.group_by(*keys).agg(pl.len())
 
-    # TODO: polars returns UInt32, libcudf returns Int32
-    with pytest.raises(AssertionError):
-        assert_gpu_result_equal(q, check_row_order=False)
-    assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False)
+    assert_gpu_result_equal(q, check_row_order=False)
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index b021d832910..865b95a7d91 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -46,3 +46,12 @@ def test_concat_vertical():
     q = pl.concat([ldf, ldf2], how="vertical")
 
     assert_gpu_result_equal(q)
+
+
+def test_concat_diagonal_empty():
+    df1 = pl.LazyFrame()
+    df2 = pl.LazyFrame({"a": [1, 2]})
+
+    q = pl.concat([df1, df2], how="diagonal_relaxed")
+
+    assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True})