diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1275aad757c..ad3f5940b94 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,8 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - run_script: "ci/clang_tidy.sh" + run_script: "ci/cpp_linters.sh" + file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5234f58efe..6d070a8a14c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,8 +90,8 @@ repos: entry: | # Check for usage of default_rng without seeding default_rng\(\)| - # Check for usage of np.random.seed - np.random.seed\( + # Check for usage of np.random.seed (NPY002 only disallows this being called) + np.random.seed language: pygrep types: [python] - id: cmake-format diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index ae4eb0d5c66..32dd5a7fa62 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -27,4 +27,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 79853cdbdb2..38048125247 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/cudf_polars" ./ci/build_wheel.sh cudf-polars ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index 00c64afa2ef..b0ae2f23abc 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/dask_cudf" ./ci/build_wheel.sh dask-cudf ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index aabd3814a24..af49942c8cd 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -37,4 +37,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index c4a89f20f5f..5a8f3397714 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -25,4 +25,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh deleted file mode 100755 index 4d5d3fc3136..00000000000 --- a/ci/clang_tidy.sh +++ /dev/null @@ -1,29 +0,0 @@ 
-#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. - -set -euo pipefail - -rapids-logger "Create clang-tidy conda environment" -. /opt/conda/etc/profile.d/conda.sh - -ENV_YAML_DIR="$(mktemp -d)" - -rapids-dependency-file-generator \ - --output conda \ - --file-key clang_tidy \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" - -rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy - -# Temporarily allow unbound variables for conda activation. -set +u -conda activate clang_tidy -set -u - -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" - -source rapids-configure-sccache - -# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled. -cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja -cmake --build cpp/build diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh new file mode 100755 index 00000000000..a7c7255456f --- /dev/null +++ b/ci/cpp_linters.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create checks conda environment" +. /opt/conda/etc/profile.d/conda.sh + +ENV_YAML_DIR="$(mktemp -d)" + +rapids-dependency-file-generator \ + --output conda \ + --file-key clang_tidy \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate clang_tidy +set -u + +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + +source rapids-configure-sccache + +# TODO: For testing purposes, clone and build IWYU. We can switch to a release +# once a clang 19-compatible version is available, which should be soon +# (https://github.com/include-what-you-use/include-what-you-use/issues/1641). +git clone --depth 1 https://github.com/include-what-you-use/include-what-you-use.git +pushd include-what-you-use +# IWYU's CMake build uses some Python scripts that assume that the cwd is +# importable, so support that legacy behavior. +export PYTHONPATH=${PWD}:${PYTHONPATH:-} +cmake -S . -B build -GNinja --install-prefix=${CONDA_PREFIX} +cmake --build build +cmake --install build +popd + +# Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. +cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_STATIC_LINTERS=ON -GNinja +cmake --build cpp/build 2>&1 | python cpp/scripts/parse_iwyu_output.py + +# Remove invalid components of the path for local usage. The path below is +# valid in the CI due to where the project is cloned, but presumably the fixes +# will be applied locally from inside a clone of cudf. +sed -i 's/\/__w\/cudf\/cudf\///' iwyu_results.txt diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index f5bcdc62604..fefe26984cb 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? 
-if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -63,9 +47,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Running polars test suite PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 2884757e46b..6c827406f78 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? -if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -65,9 +49,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Testing PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 00000000000..5910a5c59fe --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8a64ebf40c5..e91443ddba8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 @@ -66,10 +66,10 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit - ptxcompiler -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5f779c3170f..2dccb595e59 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 @@ -64,9 +64,9 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 - pynvjitlink>=0.0.0a0 - pytest-benchmark diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index edf92b930d9..7a477291e7a 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.12 + - polars >=1.11,<1.14 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2aafcae072d..04904e95630 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13 + - numba-cuda >=0.0.13,<0.0.18 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index 12120a5c6d1..0e5699876fc 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -39,8 +39,8 @@ Checks: -clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.cplusplus.UninitializedObject' -WarningsAsErrors: '*' -HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +WarningsAsErrors: '' +HeaderFilterRegex: '.*cudf/cpp/(src|include).*' ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bfa4bf80724..e237b0b2856 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -88,7 +88,13 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) -option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF) +option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) + +option( + CUDF_KVIKIO_REMOTE_IO + "Enable remote IO (e.g. 
 AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec." + ON +) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -109,6 +115,9 @@ message( "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE + "CUDF: Build with remote IO (e.g. AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}" +) # Set a default build type if none was specified rapids_cmake_build_type("Release") @@ -146,8 +155,10 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR) endif() # ################################################################################################## -# * clang-tidy configuration ---------------------------------------------------------------------- -if(CUDF_CLANG_TIDY) +# * linter configuration --------------------------------------------------------------------------- +if(CUDF_STATIC_LINTERS) + # For simplicity, for now we assume that all linters can be installed into an environment where + # any linter is being run. We could relax this requirement if desired. find_program( CLANG_TIDY_EXE NAMES "clang-tidy" @@ -174,24 +185,48 @@ if(CUDF_CLANG_TIDY) "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" ) endif() + + find_program(IWYU_EXE NAMES include-what-you-use iwyu REQUIRED) endif() # Turn on the requested static checkers for a target, excluding the files specified in SKIPPED_FILES. -function(enable_clang_tidy target) - set(_tidy_options) +function(enable_static_checkers target) + set(_tidy_options IWYU CLANG_TIDY) set(_tidy_one_value) set(_tidy_multi_value SKIPPED_FILES) cmake_parse_arguments( - _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + _LINT "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} ) - if(CUDF_CLANG_TIDY) - # clang will complain about unused link libraries on the compile line unless we specify - # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) - foreach(file IN LISTS _TIDY_SKIPPED_FILES) + if(CUDF_STATIC_LINTERS) + if(_LINT_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() + if(_LINT_IWYU) + # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not + # relevant since they don't show up in any other build, so it's better to suppress them until + # we can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does not + # appear to be sufficient; we must also ensure that it is added to the underlying target's CXX + # compile flags. To do this completely cleanly we should modify the flags on the target rather + # than the global CUDF_CXX_FLAGS, but this solution is good enough for now since we never run + # the linters on real builds.
+      foreach(_flag -Wno-missing-braces -Wno-absolute-value -Wunneeded-internal-declaration) +        list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index) +        if(_flag_index EQUAL -1) +          list(APPEND CUDF_CXX_FLAGS ${_flag}) +        endif() +      endforeach() +      set(CUDF_CXX_FLAGS +          "${CUDF_CXX_FLAGS}" +          PARENT_SCOPE +      ) +      set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}") +    endif() +    foreach(file IN LISTS _LINT_SKIPPED_FILES) set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) endforeach() endif() @@ -368,11 +403,14 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_aggregations_null.cu + src/groupby/hash/compute_global_memory_aggs.cu + src/groupby/hash/compute_global_memory_aggs_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_mapping_indices.cu src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_shared_memory_aggs.cu - src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu @@ -667,6 +705,7 @@ add_library( src/strings/replace/replace_slice.cu src/strings/reverse.cu src/strings/scan/scan_inclusive.cu + src/strings/search/contains_multiple.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu @@ -771,11 +810,15 @@ set_target_properties( INTERFACE_POSITION_INDEPENDENT_CODE ON ) +# Note: This must come before the target_compile_options below so that the function can modify the +# flags if necessary. +enable_static_checkers( + cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp CLANG_TIDY IWYU +) target_compile_options( cudf PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>" "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>" ) -enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp) if(CUDF_BUILD_STACKTRACE_DEBUG) # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option @@ -857,6 +900,9 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL # Define spdlog level target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Enable remote IO through KvikIO +target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index bdc360c082b..419b78db9b0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -348,23 +348,15 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( - STRINGS_BENCH - string/convert_datetime.cpp - string/convert_durations.cpp - string/copy.cu - string/factory.cu - string/filter.cpp - string/repeat_strings.cpp -
string/replace.cpp - string/translate.cpp - string/url_decode.cu + STRINGS_BENCH string/factory.cu string/filter.cpp string/repeat_strings.cpp string/replace.cpp + string/translate.cpp string/url_decode.cu ) ConfigureNVBench( @@ -373,14 +365,17 @@ ConfigureNVBench( string/char_types.cpp string/combine.cpp string/contains.cpp + string/convert_datetime.cpp + string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp + string/copy.cpp string/copy_if_else.cpp string/copy_range.cpp string/count.cpp string/extract.cpp string/find.cpp - string/gather.cpp + string/find_multiple.cpp string/join_strings.cpp string/lengths.cpp string/like.cpp diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index bdce8a31176..8bce718c7d8 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -540,7 +542,7 @@ struct string_generator { // range 32-127 is ASCII; 127-136 will be multi-byte UTF-8 { } - __device__ void operator()(thrust::tuple str_begin_end) + __device__ void operator()(thrust::tuple str_begin_end) { auto begin = thrust::get<0>(str_begin_end); auto end = thrust::get<1>(str_begin_end); @@ -569,6 +571,9 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons distribution_params{1. - profile.get_null_probability().value_or(0)}); auto lengths = len_dist(engine, num_rows + 1); auto null_mask = valid_dist(engine, num_rows + 1); + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + thrust::transform_if( thrust::device, lengths.begin(), @@ -580,28 +585,26 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); - rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); - thrust::exclusive_scan( - thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); - // offsets are ready. - auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); + + // offsets are created as INT32 or INT64 as appropriate + auto [offsets, chars_length] = cudf::strings::detail::make_offsets_child_column( + valid_lengths, valid_lengths + num_rows, stream, mr); + // use the offsetalator to normalize the offset values for use by the string_generator + auto offsets_itr = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, - thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), + thrust::make_zip_iterator(offsets_itr, offsets_itr + 1), num_rows, string_generator{chars.data(), engine}); + auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), - null_mask.end() - 1, - thrust::identity{}, - cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + profile.get_null_probability().has_value() + ? cudf::detail::valid_if( + null_mask.begin(), null_mask.end() - 1, thrust::identity{}, stream, mr) + : std::pair{rmm::device_buffer{}, 0}; + return cudf::make_strings_column( - num_rows, - std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - chars.release(), - null_count, - profile.get_null_probability().has_value() ? 
std::move(result_bitmask) : rmm::device_buffer{}); + num_rows, std::move(offsets), chars.release(), null_count, std::move(result_bitmask)); } /** diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp index 1e3ab2b7b4f..cc548ccd3de 100644 --- a/cpp/benchmarks/io/nvbench_helpers.hpp +++ b/cpp/benchmarks/io/nvbench_helpers.hpp @@ -28,6 +28,7 @@ enum class data_type : int32_t { INTEGRAL = static_cast(type_group_id::INTEGRAL), INTEGRAL_SIGNED = static_cast(type_group_id::INTEGRAL_SIGNED), FLOAT = static_cast(type_group_id::FLOATING_POINT), + BOOL8 = static_cast(cudf::type_id::BOOL8), DECIMAL = static_cast(type_group_id::FIXED_POINT), TIMESTAMP = static_cast(type_group_id::TIMESTAMP), DURATION = static_cast(type_group_id::DURATION), @@ -44,6 +45,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( case data_type::INTEGRAL: return "INTEGRAL"; case data_type::INTEGRAL_SIGNED: return "INTEGRAL_SIGNED"; case data_type::FLOAT: return "FLOAT"; + case data_type::BOOL8: return "BOOL8"; case data_type::DECIMAL: return "DECIMAL"; case data_type::TIMESTAMP: return "TIMESTAMP"; case data_type::DURATION: return "DURATION"; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index ce115fd7723..b14f9cbb67e 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -114,6 +114,7 @@ void BM_parquet_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -298,6 +299,7 @@ void BM_parquet_read_wide_tables_mixed(nvbench::state& state) using d_type_list = nvbench::enum_type_list(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 256e50f0e64..84e4b8b93c0 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -89,6 +89,7 @@ void BM_parq_write_io_compression( { auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -143,6 +144,7 @@ void BM_parq_write_varying_options( auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -181,6 +183,7 @@ void BM_parq_write_varying_options( using d_type_list = nvbench::enum_type_list(joined_table->column("n_name")); - auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); + auto o_year = cudf::datetime::extract_datetime_component( + joined_table->column("o_orderdate"), cudf::datetime::datetime_component::YEAR); auto amount = calculate_amount(joined_table->column("l_discount"), joined_table->column("l_extendedprice"), joined_table->column("ps_supplycost"), diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 5deca3664b7..288aa6029d3 100644 --- 
a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -16,62 +16,59 @@ #include #include -#include -#include #include #include +#include #include -class StringDateTime : public cudf::benchmark {}; +#include -enum class direction { to, from }; +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_D, "cudf::timestamp_D", "cudf::timestamp_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "cudf::timestamp_s", "cudf::timestamp_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_us, "cudf::timestamp_us", "cudf::timestamp_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ns, "cudf::timestamp_ns", "cudf::timestamp_ns"); -template -void BM_convert_datetime(benchmark::State& state, direction dir) +using Types = nvbench::type_list; + +template +void bench_convert_datetime(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); - auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const from_ts = state.get_string("dir") == "from"; - auto const column = create_random_column(data_type.id(), row_count{n_rows}); - cudf::column_view input(column->view()); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const ts_col = create_random_column(data_type.id(), row_count{num_rows}); - auto source = dir == direction::to ? cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S") - : make_empty_column(cudf::data_type{cudf::type_id::STRING}); - cudf::strings_column_view source_string(source->view()); + auto format = std::string{"%Y-%m-%d %H:%M:%S"}; + auto s_col = cudf::strings::from_timestamps(ts_col->view(), format); + auto sv = cudf::strings_column_view(s_col->view()); - for (auto _ : state) { - cuda_event_timer raii(state, true); - if (dir == direction::to) - cudf::strings::to_timestamps(source_string, data_type, "%Y-%m-%d %H:%M:%S"); - else - cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S"); - } + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - auto const bytes = dir == direction::to ? 
source_string.chars_size(cudf::get_default_stream()) - : n_rows * sizeof(TypeParam); - state.SetBytesProcessed(state.iterations() * bytes); + if (from_ts) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(sv.chars_size(stream)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::from_timestamps(ts_col->view(), format); + }); + } else { + state.add_global_memory_reads(sv.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::to_timestamps(sv, data_type, format); + }); + } } -#define STR_BENCHMARK_DEFINE(name, type, dir) \ - BENCHMARK_DEFINE_F(StringDateTime, name)(::benchmark::State & state) \ - { \ - BM_convert_datetime(state, dir); \ - } \ - BENCHMARK_REGISTER_F(StringDateTime, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -STR_BENCHMARK_DEFINE(from_days, cudf::timestamp_D, direction::from); -STR_BENCHMARK_DEFINE(from_seconds, cudf::timestamp_s, direction::from); -STR_BENCHMARK_DEFINE(from_mseconds, cudf::timestamp_ms, direction::from); -STR_BENCHMARK_DEFINE(from_useconds, cudf::timestamp_us, direction::from); -STR_BENCHMARK_DEFINE(from_nseconds, cudf::timestamp_ns, direction::from); - -STR_BENCHMARK_DEFINE(to_days, cudf::timestamp_D, direction::to); -STR_BENCHMARK_DEFINE(to_seconds, cudf::timestamp_s, direction::to); -STR_BENCHMARK_DEFINE(to_mseconds, cudf::timestamp_ms, direction::to); -STR_BENCHMARK_DEFINE(to_useconds, cudf::timestamp_us, direction::to); -STR_BENCHMARK_DEFINE(to_nseconds, cudf::timestamp_ns, direction::to); +NVBENCH_BENCH_TYPES(bench_convert_datetime, NVBENCH_TYPE_AXES(Types)) + .set_name("datetime") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22}); diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index f12d292c2e7..9d2377f2d82 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,92 +14,60 @@ * limitations under the License. 
*/ +#include #include -#include - -#include #include #include -#include +#include #include -#include -#include - -class DurationsToString : public cudf::benchmark {}; -template -void BM_convert_from_durations(benchmark::State& state) -{ - cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); +#include - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_D, "cudf::duration_D", "cudf::duration_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_s, "cudf::duration_s", "cudf::duration_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ms, "cudf::duration_ms", "cudf::duration_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_us, "cudf::duration_us", "cudf::duration_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ns, "cudf::duration_ns", "cudf::duration_ns"); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); -} +using Types = nvbench::type_list; -class StringToDurations : public cudf::benchmark {}; -template -void BM_convert_to_durations(benchmark::State& state) +template +void bench_convert_duration(nvbench::state& state, nvbench::type_list) { - cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); - - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); - auto results = cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - cudf::strings_column_view source_string(*results); - auto output_type = cudf::data_type(cudf::type_to_id()); - - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::to_durations(source_string, output_type, "%D days %H:%M:%S"); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const from_dur = state.get_string("dir") == "from"; + + auto const ts_col = create_random_column(data_type.id(), row_count{num_rows}); + cudf::column_view input(ts_col->view()); + + auto format = std::string{"%D days %H:%M:%S"}; + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_dur) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(format.size() * num_rows); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::from_durations(input, format); }); + } else { + auto source = cudf::strings::from_durations(input, format); + auto view = cudf::strings_column_view(source->view()); + state.add_global_memory_reads(view.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::to_durations(view, data_type, format); + }); } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); } -#define DSBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(DurationsToString, name)(::benchmark::State & state) \ - { \ - BM_convert_from_durations(state); \ - } \ - 
BENCHMARK_REGISTER_F(DurationsToString, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define SDBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(StringToDurations, name)(::benchmark::State & state) \ - { \ - BM_convert_to_durations(state); \ - } \ - BENCHMARK_REGISTER_F(StringToDurations, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -DSBM_BENCHMARK_DEFINE(from_durations_D, cudf::duration_D); -DSBM_BENCHMARK_DEFINE(from_durations_s, cudf::duration_s); -DSBM_BENCHMARK_DEFINE(from_durations_ms, cudf::duration_ms); -DSBM_BENCHMARK_DEFINE(from_durations_us, cudf::duration_us); -DSBM_BENCHMARK_DEFINE(from_durations_ns, cudf::duration_ns); - -SDBM_BENCHMARK_DEFINE(to_durations_D, cudf::duration_D); -SDBM_BENCHMARK_DEFINE(to_durations_s, cudf::duration_s); -SDBM_BENCHMARK_DEFINE(to_durations_ms, cudf::duration_ms); -SDBM_BENCHMARK_DEFINE(to_durations_us, cudf::duration_us); -SDBM_BENCHMARK_DEFINE(to_durations_ns, cudf::duration_ns); +NVBENCH_BENCH_TYPES(bench_convert_duration, NVBENCH_TYPE_AXES(Types)) + .set_name("duration") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 10, 1 << 15, 1 << 20, 1 << 25}); diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cpp new file mode 100644 index 00000000000..2baccd4fad1 --- /dev/null +++ b/cpp/benchmarks/string/copy.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_copy(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const api = state.get_string("api"); + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const source = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + data_profile const map_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto const map_table = + create_random_table({cudf::type_to_id()}, row_count{num_rows}, map_profile); + auto const map_view = map_table->view().column(0); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (api == "gather") { + auto result = + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + }); + } else if (api == "scatter") { + auto const target = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + auto result = cudf::scatter(source->view(), map_view, target->view(), stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::scatter(source->view(), map_view, target->view(), stream); + }); + } +} + +NVBENCH_BENCH(bench_copy) + .set_name("copy") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"gather", "scatter"}); diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu deleted file mode 100644 index 6b2f6c3a0a7..00000000000 --- a/cpp/benchmarks/string/copy.cu +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "string_bench_args.hpp" - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -class StringCopy : public cudf::benchmark {}; - -enum copy_type { gather, scatter }; - -static void BM_copy(benchmark::State& state, copy_type ct) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - - auto const source = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - auto const target = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - - // scatter indices - auto index_map_col = make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); - auto index_map = index_map_col->mutable_view(); - thrust::shuffle_copy(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(n_rows), - index_map.begin(), - thrust::default_random_engine()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (ct) { - case gather: cudf::gather(source->view(), index_map); break; - case scatter: cudf::scatter(source->view(), index_map, target->view()); break; - } - } - - state.SetBytesProcessed( - state.iterations() * - cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream())); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); - - // Benchmark for very small strings - b->Args({67108864, 2}); -} - -#define COPY_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringCopy, name) \ - (::benchmark::State & st) { BM_copy(st, copy_type::name); } \ - BENCHMARK_REGISTER_F(StringCopy, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -COPY_BENCHMARK_DEFINE(gather) -COPY_BENCHMARK_DEFINE(scatter) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 996bdcf0332..3ea3ff13a2f 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -44,15 +42,13 @@ static void bench_find_string(nvbench::state& state) auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - std::vector h_targets({"5W", "5W43", "0987 5W43"}); - cudf::string_scalar target(h_targets[2]); - cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + cudf::string_scalar target("0987 5W43"); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); - if (api.substr(0, 4) == "find") { + if (api == "find") { state.add_global_memory_writes(input.size()); } else { state.add_global_memory_writes(input.size()); @@ -61,10 +57,6 @@ static void bench_find_string(nvbench::state& state) if (api == "find") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& 
 launch) { cudf::strings::find(input, target); }); - } else if (api == "find_multi") { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - }); } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); @@ -79,7 +71,7 @@ NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp new file mode 100644 index 00000000000..0e780fdb302 --- /dev/null +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_find_string(nvbench::state& state) +{ + auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width")); + auto const hit_rate = static_cast<cudf::size_type>(state.get_int64("hit_rate")); + auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets")); + auto const api = state.get_string("api"); + + auto const stream = cudf::get_default_stream(); + auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const input = cudf::strings_column_view(col->view()); + + // Note that these all match the first row of the raw_data in create_string_column. + // This is so the hit_rate can be properly accounted for.
+ std::vector<std::string> const target_data( + {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"}); + auto h_targets = std::vector<std::string>{}; + for (cudf::size_type i = 0; i < target_count; i++) { + h_targets.emplace_back(target_data[i % target_data.size()]); + } + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + if (api == "find") { + state.add_global_memory_writes(input.size()); + } else { + state.add_global_memory_writes(input.size()); + } + + if (api == "find") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); + }); + } else if (api == "contains") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::contains_multiple(input, cudf::strings_column_view(targets)); + }); + } +} + +NVBENCH_BENCH(bench_find_string) + .set_name("find_multiple") + .add_string_axis("api", {"find", "contains"}) + .add_int64_axis("targets", {10, 20, 40}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp deleted file mode 100644 index 5b1c679be7d..00000000000 --- a/cpp/benchmarks/string/gather.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include - -#include -#include -#include - -#include - -static void bench_gather(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const input_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); - - data_profile const map_profile = data_profile_builder().no_validity().distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows); - auto const map_table = - create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = - cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); // all bytes are read; - state.add_global_memory_writes(chars_size); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::gather( - input_table->view(), map_table->view().column(0), cudf::out_of_bounds_policy::NULLIFY); - }); -} - -NVBENCH_BENCH(bench_gather) - .set_name("gather") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 31ce60d8f9a..a80d0dcbdb8 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -20,8 +20,6 @@ #include -#include - #include static void bench_minhash(nvbench::state& state) @@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const parameters = static_cast(state.get_int64("parameters")); auto const base64 = state.get_int64("hash_type") == 64; - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); - data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - seeds.set_null_mask(rmm::device_buffer{}, 0); + data_profile const param_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), + distribution_id::NORMAL, + 0u, + std::numeric_limits::max()); + auto const param_type = base64 ? 
cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const param_table = + create_random_table({param_type, param_type}, row_count{parameters}, param_profile); + auto const parameters_a = param_table->view().column(0); + auto const parameters_b = param_table->view().column(1); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 + ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) - .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10}) - .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("num_rows", {15000, 30000, 60000}) + .add_int64_axis("row_width", {6000, 28000, 50000}) + .add_int64_axis("hash_width", {12, 24}) + .add_int64_axis("parameters", {26, 260}) .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index c949f48505e..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -22,7 +22,7 @@ function(find_and_configure_kvikio VERSION) GIT_REPOSITORY https://github.com/rapidsai/kvikio.git GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp - OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) include("${rapids-cmake-dir}/export/find_package_root.cmake") diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 940d03cdb41..2e2ac43d6fe 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -57,11 +57,13 @@ void write_json(data_sink* sink, /** * @brief Normalize single quotes to double quotes using FST * - * @param indata Input device buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param indata Input device buffer + * @param delimiter Line-separating delimiter character in JSONL inputs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index b662b660557..7cd4697f592 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -18,6 +18,7 @@ #include "types.hpp" +#include #include #include #include @@ -53,6 +54,11 @@ struct schema_element { * @brief Allows specifying this column's child columns target type */ std::map child_types; + + /** + * @brief Allows specifying the order of the columns + */ + std::optional> column_order; }; /** @@ -87,13 +93,18 @@ enum class json_recovery_mode_t { * | `chunksize` | use `byte_range_xxx` for chunking instead | */ class json_reader_options { + public: + using 
 dtype_variant = + std::variant<std::vector<data_type>, + std::map<std::string, data_type>, + std::map<std::string, schema_element>, + schema_element>; ///< Variant type holding dtypes information for the columns + + private: source_info _source; // Data types of the column; empty to infer dtypes - std::variant<std::vector<data_type>, - std::map<std::string, data_type>, - std::map<std::string, schema_element>> - _dtypes; + dtype_variant _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -178,13 +189,7 @@ class json_reader_options { * * @returns Data types of the columns */ - [[nodiscard]] std::variant<std::vector<data_type>, - std::map<std::string, data_type>, - std::map<std::string, schema_element>> const& - get_dtypes() const - { - return _dtypes; - } + [[nodiscard]] dtype_variant const& get_dtypes() const { return _dtypes; } /** * @brief Returns compression format of the source. @@ -228,7 +233,11 @@ class json_reader_options { */ [[nodiscard]] size_t get_byte_range_padding() const { - auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = + std::visit(cudf::detail::visitor_overload{ + [](auto const& dtypes) { return dtypes.size(); }, + [](schema_element const& dtypes) { return dtypes.child_types.size(); }}, + _dtypes); auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; @@ -390,6 +399,14 @@ class json_reader_options { */ void set_dtypes(std::map<std::string, schema_element> types) { _dtypes = std::move(types); } + /** + * @brief Set data types for a potentially nested column hierarchy. + * + * @param types schema element with column names and column order to support arbitrary nesting of + * data types + */ + void set_dtypes(schema_element types); + /** * @brief Set the compression type. * @@ -624,6 +641,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read. + * + * @param types Struct schema_element with Column name -> schema_element with map and order + * @return this for chaining + */ + json_reader_options_builder& dtypes(schema_element types) + { + options.set_dtypes(std::move(types)); + return *this; + } + /** * @brief Set the compression type. *
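A minimal usage sketch of the schema_element overload added above; the file name and the field names "a" and "b" are hypothetical, so treat this as an illustration of the new builder hook rather than part of the patch. The column_order member asks the reader to emit "b" before "a".

#include <cudf/io/json.hpp>

#include <string>
#include <vector>

// Sketch: read JSON Lines with an explicit nested schema and column order.
cudf::io::table_with_metadata read_with_schema()
{
  using cudf::data_type;
  using cudf::type_id;
  cudf::io::schema_element schema{
    data_type{type_id::STRUCT},
    {{"a", {data_type{type_id::INT64}}},    // child column "a" -> INT64
     {"b", {data_type{type_id::STRING}}}},  // child column "b" -> STRING
    std::vector<std::string>{"b", "a"}};    // column_order: emit "b" first
  auto opts =
    cudf::io::json_reader_options::builder(cudf::io::source_info{"input.jsonl"})
      .dtypes(schema)
      .lines(true)
      .build();
  return cudf::io::read_json(opts);
}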
+ * @param[in] stream          CUDA stream used for device memory operations and kernel launches
 * @param[in] mr              Device memory resource used to allocate the returned column's device memory
 * @returns Column of specified quantiles, with nulls for indeterminable values
@@ -59,6 +60,7 @@ std::unique_ptr<column> quantile(
   interpolation interp               = interpolation::LINEAR,
   column_view const& ordered_indices = {},
   bool exact                         = true,
+  rmm::cuda_stream_view stream       = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr  = cudf::get_current_device_resource_ref());

 /**
@@ -85,6 +87,7 @@ std::unique_ptr<column> quantile(
  * @param is_input_sorted Indicates if the input has been pre-sorted
  * @param column_order The desired sort order for each column
  * @param null_precedence The desired order of null compared to other elements
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table's device memory
 *
 * @returns Table of specified quantiles, with nulls for indeterminable values
@@ -98,6 +101,7 @@ std::unique_ptr<table> quantiles(
   cudf::sorted is_input_sorted                   = sorted::NO,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());

 /**
@@ -114,6 +118,7 @@ std::unique_ptr<table> quantiles(
  *
  * @param input tdigest input data. One tdigest per row
  * @param percentiles Desired percentiles in range [0, 1]
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device
 * memory
 *
@@ -125,6 +130,7 @@ std::unique_ptr<table> quantiles(
 std::unique_ptr<column> percentile_approx(
   tdigest::tdigest_column_view const& input,
   column_view const& percentiles,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

 /** @} */  // end of group
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 1fe446db8da..e090766dd07 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -28,8 +28,42 @@ namespace strings {
  */

 /**
- * @brief Returns a lists column with character position values where each
- * of the target strings are found in each string.
+ * @brief Searches for the given target strings within each string in the provided column
+ *
+ * Each column in the result table corresponds to the result for the target string at the same
+ * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1st for 1st,
+ * etc.
+ *
+ * If the target is not found for a string, false is returned for that entry in the output column.
+ * If the target is an empty string, true is returned for all non-null entries in the output column.
+ *
+ * Any null input strings return corresponding null entries in the output columns.
+ *
+ * @code{.pseudo}
+ * input = ["a", "b", "c"]
+ * targets = ["a", "c"]
+ * output is a table with two boolean columns:
+ *   column 0: [true, false, false]
+ *   column 1: [false, false, true]
+ * @endcode
+ *
+ * @throw std::invalid_argument if `targets` is empty or contains nulls
+ *
+ * @param input Strings instance for this operation
+ * @param targets UTF-8 encoded strings to search for in each string in `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Table of BOOL8 columns
+ */
+std::unique_ptr<table>
contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Searches for the given target strings within each string in the provided column + * and returns the position the targets were found * * The size of the output column is `input.size()`. * Each row of the output column is of size `targets.size()`. @@ -45,7 +79,7 @@ namespace strings { * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw cudf::logic_error if `targets` is empty or contains nulls + * @throw std::invalid_argument if `targets` is empty or contains nulls * * @param input Strings instance for this operation * @param targets Strings to search for in each string diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 42124461cdf..b2c1a23f57e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. 
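+ *
+ * A sketch of the computation for one row; the hash values below are
+ * illustrative only, not actual MurmurHash3_x86_32 outputs (for such small
+ * values the % mp and & max_hash steps are no-ops):
+ * @code{.pseudo}
+ * input = ["abcd"], width = 3    // substrings: "abc", "bcd"
+ * a = [2, 3], b = [1, 4]         // two (a,b) parameter pairs
+ * hv = [7, 9]                    // hv[k] = hash of the k-th substring
+ * pv for i=0: (7*2+1)=15, (9*2+1)=19  -> min is 15
+ * pv for i=1: (7*3+4)=25, (9*3+4)=31  -> min is 25
+ * output row = [15, 25]          // one minhash value per (a,b) pair
+ * @endcode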
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string, one value per (a,b) parameter pair
+ */
+std::unique_ptr<cudf::column> minhash_permuted(
+  cudf::strings_column_view const& input,
+  uint32_t seed,
+  cudf::device_span<uint32_t const> parameter_a,
+  cudf::device_span<uint32_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash value for each string
  *
@@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ * max_hash = max of uint64
+ * mp = (1 << 61) - 1
+ * hv[i] = hash value of a substring at i
+ * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ * mh[j,i] = min(pv[i]) for all substrings in row j
+ *           and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
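+ *
+ * The computation mirrors nvtext::minhash_permuted above; see the worked
+ * example there (the same steps apply here with 64-bit values).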
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string, one value per (a,b) parameter pair
+ */
+std::unique_ptr<cudf::column> minhash64_permuted(
+  cudf::strings_column_view const& input,
+  uint64_t seed,
+  cudf::device_span<uint64_t const> parameter_a,
+  cudf::device_span<uint64_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash values for each row of strings per seed
  *
diff --git a/cpp/scripts/parse_iwyu_output.py b/cpp/scripts/parse_iwyu_output.py
new file mode 100644
index 00000000000..822a980a1a8
--- /dev/null
+++ b/cpp/scripts/parse_iwyu_output.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Helper script to modify IWYU output to only include removals.
+
+Lines that are not from include-what-you-use are removed from the output.
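+
+For example, given IWYU output such as (paths and includes illustrative):
+
+    /cudf/cpp/src/foo.cu should add these lines:
+    #include <utility>
+
+    /cudf/cpp/src/foo.cu should remove these lines:
+    - #include <map>  // lines 12-12
+
+    The full include-list for /cudf/cpp/src/foo.cu:
+    #include <map>
+    ---
+
+the suggested additions are dropped and only the removals (plus the pruned
+full include-list) are written out. An empty "should add" header is still
+emitted because IWYU expects every section to be present.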
+""" + +import argparse +import re +from enum import Enum + + +class Mode(Enum): + NORMAL = 0 + ADD = 1 + REMOVE = 2 + FULL_INCLUDE_LIST = 3 + + +def extract_include_file(include_line): + """Extract the core file path from an #include directive.""" + match = re.search(r'#include\s+[<"]([^">]+)[">]', include_line) + if match: + return match.group(1) + return None + + +def parse_output(input_stream): + include_modifications = {} + current_file = None + mode = Mode.NORMAL + + for line in input_stream: + if match := re.match(r"(\/\S+) should add these lines:", line): + current_file = match.group(1) + include_modifications.setdefault( + current_file, + { + "add_includes": [], + "remove_includes": [], + "full_include_list": [], + }, + ) + mode = Mode.ADD + elif match := re.match(r"(\/\S+) should remove these lines:", line): + mode = Mode.REMOVE + elif match := re.match(r"The full include-list for (\/\S+):", line): + mode = Mode.FULL_INCLUDE_LIST + elif line.strip() == "---": + current_file = None + mode = Mode.NORMAL + else: + if current_file: + if mode == Mode.ADD: + include_modifications[current_file]["add_includes"].append( + line.strip() + ) + elif mode == Mode.REMOVE: + include_modifications[current_file][ + "remove_includes" + ].append(line.strip()) + elif mode == Mode.FULL_INCLUDE_LIST: + include_modifications[current_file][ + "full_include_list" + ].append(line.strip()) + else: + if ( + line.strip() + and "include-what-you-use reported diagnostics" not in line + and "In file included from" not in line + and "has correct #includes/fwd-decls" not in line + ): + print(line, end="") + + return include_modifications + + +def post_process_includes(include_modifications): + """Deduplicate and remove redundant entries from add and remove includes.""" + for mods in include_modifications.values(): + # Deduplicate add_includes and remove_includes + mods["add_includes"] = list(set(mods["add_includes"])) + mods["remove_includes"] = list(set(mods["remove_includes"])) + + # Extract file paths from add_includes and remove_includes + add_files = { + extract_include_file(line) for line in mods["add_includes"] + } + remove_files = { + extract_include_file(line) for line in mods["remove_includes"] + } + + # Remove entries that exist in both add_includes and remove_includes + common_files = add_files & remove_files + mods["add_includes"] = [ + line + for line in mods["add_includes"] + if extract_include_file(line) not in common_files + ] + mods["remove_includes"] = [ + line + for line in mods["remove_includes"] + if extract_include_file(line) not in common_files + ] + + # Remove entries that exist in add_includes from full_include_list + mods["full_include_list"] = [ + include + for include in mods["full_include_list"] + if extract_include_file(include) not in add_files + ] + + +def write_output(include_modifications, output_stream): + for filename, mods in include_modifications.items(): + if mods["remove_includes"]: + # IWYU requires all sections to exist, so we write out this header even + # though we never write out any actual additions. 
+ output_stream.write(f"{filename} should add these lines:\n\n") + + output_stream.write(f"{filename} should remove these lines:\n") + for line in mods["remove_includes"]: + output_stream.write(line + "\n") + output_stream.write("\n") + + output_stream.write(f"The full include-list for {filename}:\n") + for line in mods["full_include_list"]: + output_stream.write(line + "\n") + output_stream.write("---\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Process include modifications from a build output log." + ) + parser.add_argument( + "input", + nargs="?", + type=argparse.FileType("r"), + default="-", + help="Input file to read (default: stdin)", + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="iwyu_results.txt", + help="Output file to write (default: iwyu_output.txt)", + ) + args = parser.parse_args() + + include_modifications = parse_output(args.input) + post_process_includes(include_modifications) + write_output(include_modifications, args.output) + + +if __name__ == "__main__": + main() diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cc0682b68b9..6eb82618e2a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu new file mode 100644 index 00000000000..cac6c2224f0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh new file mode 100644 index 00000000000..e8b29a0e7a8 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "compute_aggregations.hpp"
+#include "compute_global_memory_aggs.hpp"
+#include "compute_mapping_indices.hpp"
+#include "compute_shared_memory_aggs.hpp"
+#include "create_sparse_results_table.hpp"
+#include "flatten_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes all aggregations from `requests` that require a single pass
+ * over the data and stores the results in `sparse_results`
+ */
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_aggregations(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  SetType& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream)
+{
+  // flatten the aggs to a table that can be operated on by aggregate_row
+  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto const d_agg_kinds                   = cudf::detail::make_device_uvector_async(
+    agg_kinds, stream, rmm::mr::get_current_device_resource());
+
+  auto const grid_size =
+    max_occupancy_grid_size<typename SetType::ref_type<cuco::insert_and_find_tag>>(num_rows);
+  auto const available_shmem_size = get_available_shared_memory_size(grid_size);
+  auto const has_sufficient_shmem =
+    available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2);
+  auto const has_dictionary_request = std::any_of(
+    requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) {
+      return cudf::is_dictionary(request.values.type());
+    });
+  auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem;
+
+  // Performs naive global memory aggregations when the workload is not compatible with shared
+  // memory, such as when aggregating dictionary columns or when there is insufficient dynamic
+  // shared memory for shared memory aggregations.
+  if (!is_shared_memory_compatible) {
+    return compute_global_memory_aggs(num_rows,
+                                      skip_rows_with_nulls,
+                                      row_bitmask,
+                                      flattened_values,
+                                      d_agg_kinds.data(),
+                                      agg_kinds,
+                                      global_set,
+                                      aggs,
+                                      sparse_results,
+                                      stream);
+  }
+
+  // 'populated_keys' contains inserted row_indices (keys) of global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(num_rows, stream);
+  // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank
+  rmm::device_uvector<cudf::size_type> local_mapping_index(num_rows, stream);
+  // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table
+  rmm::device_uvector<cudf::size_type> global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS,
+                                                            stream);
+  rmm::device_uvector<cudf::size_type> block_cardinality(grid_size, stream);
+
+  // Flag indicating whether a global memory aggregation fallback is required or not
+  rmm::device_scalar<cuda::std::atomic_flag> needs_global_memory_fallback(stream);
+
+  auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
+
+  compute_mapping_indices(grid_size,
+                          num_rows,
+                          global_set_ref,
+                          row_bitmask,
+                          skip_rows_with_nulls,
+                          local_mapping_index.data(),
+                          global_mapping_index.data(),
+                          block_cardinality.data(),
+                          needs_global_memory_fallback.data(),
+                          stream);
+
+  cuda::std::atomic_flag h_needs_fallback;
+  // Cannot use `device_scalar::value` as it requires a copy constructor, which
+  // `atomic_flag` doesn't have.
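+  // Instead, copy the flag's raw bytes to the host and synchronize the stream
+  // before reading the result below.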
+  CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback,
+                                needs_global_memory_fallback.data(),
+                                sizeof(cuda::std::atomic_flag),
+                                cudaMemcpyDefault,
+                                stream.value()));
+  stream.synchronize();
+  auto const needs_fallback = h_needs_fallback.test();
+
+  // make table that will hold sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds.data(),
+                                                         agg_kinds,
+                                                         needs_fallback,
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+  // prepare to launch kernel to do the actual aggregation
+  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+
+  compute_shared_memory_aggs(grid_size,
+                             available_shmem_size,
+                             num_rows,
+                             row_bitmask,
+                             skip_rows_with_nulls,
+                             local_mapping_index.data(),
+                             global_mapping_index.data(),
+                             block_cardinality.data(),
+                             *d_values,
+                             *d_sparse_table,
+                             d_agg_kinds.data(),
+                             stream);
+
+  // The shared memory groupby is designed so that each thread block can handle up to 128 unique
+  // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store
+  // the temporary aggregation results. In these situations, we must fall back to a global memory
+  // aggregator to process the remaining aggregation requests.
+  if (needs_fallback) {
+    auto const stride = GROUPBY_BLOCK_SIZE * grid_size;
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator{0},
+                       num_rows,
+                       global_memory_fallback_fn{global_set_ref,
+                                                 *d_values,
+                                                 *d_sparse_table,
+                                                 d_agg_kinds.data(),
+                                                 block_cardinality.data(),
+                                                 stride,
+                                                 row_bitmask,
+                                                 skip_rows_with_nulls});
+    extract_populated_keys(global_set, populated_keys, stream);
+  }
+
+  // Add results back to sparse_results cache
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggs.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp
similarity index 70%
rename from cpp/src/groupby/hash/compute_single_pass_aggs.hpp
rename to cpp/src/groupby/hash/compute_aggregations.hpp
index a7434bdf61a..829c3c808b0 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_aggregations.hpp
@@ -21,6 +21,7 @@
 #include
 #include
+#include

 namespace cudf::groupby::detail::hash {
 /**
@@ -28,11 +29,12 @@ namespace cudf::groupby::detail::hash {
  * over the data and stores the results in `sparse_results`
  */
 template <typename SetType>
-void compute_single_pass_aggs(int64_t num_keys,
-                              bool skip_rows_with_nulls,
-                              bitmask_type const* row_bitmask,
-                              SetType set,
-                              cudf::host_span<aggregation_request const> requests,
-                              cudf::detail::result_cache* sparse_results,
-                              rmm::cuda_stream_view stream);
+rmm::device_uvector<cudf::size_type> compute_aggregations(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  SetType& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu
new file mode 100644
index 00000000000..1d7184227ea
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_aggregations_null.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c)
2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_aggregations.cuh"
+#include "compute_aggregations.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_aggregations<nullable_global_set_t>(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  nullable_global_set_t& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
new file mode 100644
index 00000000000..6025686953e
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_global_memory_aggs.cuh"
+#include "compute_global_memory_aggs.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<global_set_t>(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  global_set_t& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
new file mode 100644
index 00000000000..00db149c6d9
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_global_memory_aggs.hpp"
+#include "create_sparse_results_table.hpp"
+#include "flatten_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  SetType& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream)
+{
+  auto constexpr uses_global_memory_aggs = true;
+  // 'populated_keys' contains inserted row_indices (keys) of global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(num_rows, stream);
+
+  // make table that will hold sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds,
+                                                         agg_kinds,
+                                                         uses_global_memory_aggs,
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+
+  // prepare to launch kernel to do the actual aggregation
+  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+  auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
+
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::counting_iterator{0},
+    num_rows,
+    hash::compute_single_pass_aggs_fn{
+      global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls});
+  extract_populated_keys(global_set, populated_keys, stream);
+
+  // Add results back to sparse_results cache
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggregations.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
new file mode 100644
index 00000000000..0777b9ffd93
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  SetType& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
new file mode 100644
index 00000000000..209e2b7f20a
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_global_memory_aggs.cuh"
+#include "compute_global_memory_aggs.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<nullable_global_set_t>(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  nullable_global_set_t& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index 59457bea694..e1dbf2a3d9e 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */

+#include "compute_aggregations.hpp"
 #include "compute_groupby.hpp"
-#include "compute_single_pass_aggs.hpp"
 #include "helpers.cuh"
 #include "sparse_to_dense_results.hpp"

@@ -29,7 +29,6 @@
 #include
 #include
-#include
 #include
 #include

@@ -38,18 +37,6 @@
 #include

 namespace cudf::groupby::detail::hash {
-template <typename SetType>
-rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
-                                                      size_type num_keys,
-                                                      rmm::cuda_stream_view stream)
-{
-  rmm::device_uvector<size_type> populated_keys(num_keys, stream);
-  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
-
-  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
-  return populated_keys;
-}
-
 template <typename Equal, typename Hash>
 std::unique_ptr<table>
 compute_groupby(table_view const& keys,
                 host_span<aggregation_request const> requests,
@@ -67,8 +54,8 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
   // column is indexed by the hash set
   cudf::detail::result_cache sparse_results(requests.size());

-  auto const set = cuco::static_set{
-    num_keys,
+  auto set = cuco::static_set{
+    cuco::extent{num_keys},
     cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% load factor
     cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
     d_row_equal,
@@ -84,17 +71,13 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
                       : rmm::device_buffer{};

   // Compute all single pass aggs first
-  compute_single_pass_aggs(num_keys,
-                           skip_rows_with_nulls,
-                           static_cast<bitmask_type*>(row_bitmask.data()),
-                           set.ref(cuco::insert_and_find),
-                           requests,
-                           &sparse_results,
-                           stream);
-
-  // Extract the populated indices from the hash set and create a gather map.
-  // Gathering using this map from sparse results will give dense results.
-  auto gather_map = extract_populated_keys(set, keys.num_rows(), stream);
+  auto gather_map = compute_aggregations(num_keys,
+                                         skip_rows_with_nulls,
+                                         static_cast<bitmask_type*>(row_bitmask.data()),
+                                         set,
+                                         requests,
+                                         &sparse_results,
+                                         stream);

   // Compact all results from sparse_results and insert into cache
   sparse_to_dense_results(requests,
@@ -114,12 +97,6 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
                           mr);
 }

-template rmm::device_uvector<size_type> extract_populated_keys(
-  global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
-
-template rmm::device_uvector<size_type> extract_populated_keys(
-  nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
-
 template std::unique_ptr<table>
 compute_groupby(
  table_view const& keys,
  host_span<aggregation_request const> requests,
diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp
index 7bb3a60ff07..77243dc0a4f 100644
--- a/cpp/src/groupby/hash/compute_groupby.hpp
+++ b/cpp/src/groupby/hash/compute_groupby.hpp
@@ -22,28 +22,11 @@
 #include
 #include
-#include
 #include
 #include

 namespace cudf::groupby::detail::hash {
-/**
- * @brief Computes and returns a device vector containing all populated keys in
- * `key_set`.
- *
- * @tparam SetType Type of key hash set
- *
- * @param key_set Key hash set
- * @param num_keys Number of input keys
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return An array of unique keys contained in `key_set`
- */
-template <typename SetType>
-rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
-                                                      size_type num_keys,
-                                                      rmm::cuda_stream_view stream);
-
 /**
  * @brief Computes groupby using hash table.
  *
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 12c02a1865e..f0361ccced2 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -47,9 +47,8 @@ struct size_of_functor {
 /// Shared memory data alignment
 CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;

-// Prepares shared memory data required by each output column, exits if
-// no enough memory space to perform the shared memory aggregation for the
-// current output column
+// Allocates shared memory required for output columns. Exits if there is insufficient memory to
+// perform shared memory aggregation for the current output column.
 __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                                                cudf::size_type& col_end,
                                                cudf::mutable_table_device_view output_values,
@@ -74,9 +73,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                        ALIGNMENT);
     auto const next_col_total_size = next_col_size + valid_col_size;

-    if (bytes_allocated + next_col_total_size > total_agg_size) {
-      CUDF_UNREACHABLE("Not enough memory for shared memory aggregations");
-    }
+    if (bytes_allocated + next_col_total_size > total_agg_size) { break; }

     shmem_agg_res_offsets[col_end]  = bytes_allocated;
     shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
@@ -275,7 +272,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
   }
 }
 }  // namespace

-std::size_t available_shared_memory_size(cudf::size_type grid_size)
+std::size_t get_available_shared_memory_size(cudf::size_type grid_size)
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
@@ -302,11 +299,11 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
 {
   // For each aggregation, need one offset determining where the aggregation is
   // performed, another indicating the validity of the aggregation
-  auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type);
+  auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns());
   // The rest of shmem is utilized for the actual arrays in shmem
-  CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2,
+  CUDF_EXPECTS(available_shmem_size > offsets_size * 2,
                "No enough space for shared memory aggregations");
-  auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2;
+  auto const shmem_agg_size = available_shmem_size - offsets_size * 2;
   single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, available_shmem_size, stream.value()>>>(
     num_input_rows,
row_bitmask, @@ -318,6 +315,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_offsets_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 653821fd53b..346956cdab0 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -22,8 +22,12 @@ #include namespace cudf::groupby::detail::hash { +std::size_t get_available_shared_memory_size(cudf::size_type grid_size); -std::size_t available_shared_memory_size(cudf::size_type grid_size); +std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, @@ -37,5 +41,4 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu deleted file mode 100644 index e292543e6e9..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "compute_single_pass_aggs.hpp" -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" -#include "var_hash_functor.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -namespace cudf::groupby::detail::hash { -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - SetType set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_keys, - hash::compute_single_pass_aggs_fn{ - set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } -} - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - nullable_hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index 22fa4fc584c..bc32e306b3f 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -15,53 +15,110 @@ */ #include "create_sparse_results_table.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include #include #include -#include -#include -#include #include #include #include +#include + +#include #include #include #include namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view 
const& flattened_values, - std::vector aggs, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream) { // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - sparse_columns.reserve(flattened_values.num_columns()); - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + agg_kinds.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto const mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream); + } + return sparse_table; +} - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); +template void extract_populated_keys( + nullable_global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + nullable_global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index c1d4e0d3f20..8155ce852e0 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -15,18 +15,41 @@ */ #pragma once +#include #include #include #include #include #include +#include #include namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. 
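+ * Each populated key is the row index of a unique grouping key that was
+ * inserted into the hash set.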
+ * + * @tparam SetType Type of the key hash set + * + * @param key_set Key hash set + * @param populated_keys Array of unique keys + * @param stream CUDA stream used for device memory operations and kernel launches + * @return An array of unique keys contained in `key_set` + */ +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view const& flattened_values, - std::vector aggs_kinds, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 00836567b4f..f950e03e0fb 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,8 +23,6 @@ #include namespace cudf::groupby::detail::hash { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 28a5b578e00..048c9252773 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -15,12 +15,14 @@ */ #pragma once -#include +#include "helpers.cuh" + #include -#include -#include +#include +#include +#include -#include +#include namespace cudf::groupby::detail::hash { // TODO: TO BE REMOVED issue tracked via #17171 @@ -104,6 +106,114 @@ struct initialize_shmem { } }; +template +struct initialize_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_width() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +struct initialize_gmem { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + initialize_target_element_gmem{}(target, target_index); + } +}; + +struct initialize_sparse_table { + cudf::size_type const* row_indices; + cudf::mutable_table_device_view sparse_table; + cudf::aggregation::Kind const* __restrict__ aggs; + initialize_sparse_table(cudf::size_type const* row_indices, + cudf::mutable_table_device_view sparse_table, + cudf::aggregation::Kind const* aggs) + : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs) + { + } + __device__ void 
operator()(cudf::size_type i) + { + auto key_idx = row_indices[i]; + for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { + cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), + aggs[col_idx], + initialize_gmem{}, + sparse_table.column(col_idx), + key_idx); + } + } +}; + +template +struct global_memory_fallback_fn { + SetType set; + cudf::table_device_view input_values; + cudf::mutable_table_device_view output_values; + cudf::aggregation::Kind const* __restrict__ aggs; + cudf::size_type* block_cardinality; + cudf::size_type stride; + bitmask_type const* __restrict__ row_bitmask; + bool skip_rows_with_nulls; + + global_memory_fallback_fn(SetType set, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + cudf::size_type* block_cardinality, + cudf::size_type stride, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls) + : set(set), + input_values(input_values), + output_values(output_values), + aggs(aggs), + block_cardinality(block_cardinality), + stride(stride), + row_bitmask(row_bitmask), + skip_rows_with_nulls(skip_rows_with_nulls) + { + } + + __device__ void operator()(cudf::size_type i) + { + auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE; + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { + auto const result = set.insert_and_find(i); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + } + } +}; + /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 570a00cbfc2..7fafa885c66 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -269,7 +269,8 @@ std::map unified_schema(cudf::io::json_reader_optio }); return dnew; }, - [](std::map const& user_dtypes) { return user_dtypes; }}, + [](std::map const& user_dtypes) { return user_dtypes; }, + [](schema_element const& user_dtypes) { return user_dtypes.child_types; }}, options.get_dtypes()); } @@ -492,7 +493,7 @@ std::pair, hashmap_of_device_columns> build_tree auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); - auto lookup_names = [&column_names](auto child_ids, auto name) { + auto lookup_names = [&column_names](auto const& child_ids, auto const& name) { for (auto const& child_id : child_ids) { if (column_names[child_id] == name) return child_id; } @@ -569,7 +570,7 @@ std::pair, hashmap_of_device_columns> build_tree for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size(); i++) { NodeIndexT const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; auto value_id = std::stol(name); if (value_id >= 0 and value_id < static_cast(user_dtypes.size())) mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]}); @@ -580,7 +581,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if 
(user_dtypes.count(name)) mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)}); } @@ -589,10 +590,19 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, user_dtypes.at(name)); } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + schema_element const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto const& name = column_names[first_child_id]; + if (user_dtypes.child_types.count(name) != 0) + mark_is_pruned(first_child_id, user_dtypes.child_types.at(name)); + } }}, options.get_dtypes()); } else { @@ -626,7 +636,9 @@ std::pair, hashmap_of_device_columns> build_tree [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( std::map const& user_dtypes) -> void { mark_is_pruned(root_struct_col_id, u_schema); - }}, + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](schema_element const& user_dtypes) + -> void { mark_is_pruned(root_struct_col_id, u_schema); }}, options.get_dtypes()); } // Useful for array of arrays @@ -714,7 +726,7 @@ std::pair, hashmap_of_device_columns> build_tree if (expected_category == NC_STRUCT) { // find field column ids, and its children and create columns. for (auto const& field_id : child_ids) { - auto name = column_names[field_id]; + auto const& name = column_names[field_id]; if (is_pruned[field_id]) continue; auto inserted = ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; @@ -745,7 +757,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map> array_values; for (auto const& child_id : child_ids) { if (is_pruned[child_id]) continue; - auto name = column_names[child_id]; + auto const& name = column_names[child_id]; array_values[std::stoi(name)].push_back(child_id); } // diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 7e4d975e431..30a154fdda2 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -399,9 +399,9 @@ std::pair, std::vector> device_json_co // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema return {std::move(col), std::vector{}}; @@ -410,12 +410,37 @@ std::pair, std::vector> device_json_co std::vector> child_columns; std::vector column_names{}; size_type num_rows{json_col.num_rows}; + + bool const has_column_order = + prune_columns and not schema.value_or(schema_element{}) + .column_order.value_or(std::vector{}) + .empty(); + + auto const& col_order = + has_column_order ? 
schema.value().column_order.value() : json_col.column_order; + // Create children columns - for (auto const& col_name : json_col.column_order) { - auto const& col = json_col.child_columns.find(col_name); - column_names.emplace_back(col->first); - auto& child_col = col->second; + for (auto const& col_name : col_order) { auto child_schema_element = get_child_schema(col_name); + auto const found_it = json_col.child_columns.find(col_name); + + if (prune_columns and found_it == std::end(json_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + column_names.emplace_back(make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), col_name)); + auto all_null_column = make_all_nulls_column( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), + num_rows, + stream, + mr); + child_columns.emplace_back(std::move(all_null_column)); + continue; + } + column_names.emplace_back(found_it->first); + + auto& child_col = found_it->second; if (!prune_columns or child_schema_element.has_value()) { auto [child_column, names] = device_json_column_to_cudf_column( child_col, d_input, options, prune_columns, child_schema_element, stream, mr); @@ -576,11 +601,21 @@ table_with_metadata device_parse_nested_json(device_span d_input, std::vector out_column_names; auto parse_opt = parsing_options(options, stream); - // Iterate over the struct's child columns and convert to cudf column - size_type column_index = 0; - for (auto const& col_name : root_struct_col.column_order) { - auto& json_col = root_struct_col.child_columns.find(col_name)->second; + schema_element const* prune_schema = std::get_if(&options.get_dtypes()); + bool const has_column_order = options.is_enabled_prune_columns() and prune_schema != nullptr and + prune_schema->column_order.has_value() and + not prune_schema->column_order->empty(); + auto const& col_order = + has_column_order ? prune_schema->column_order.value() : root_struct_col.column_order; + if (has_column_order) { + CUDF_EXPECTS(prune_schema->child_types.size() == col_order.size(), + "Input schema column order size mismatch with input schema child types"); + } + auto root_col_size = root_struct_col.num_rows; + // Iterate over the struct's child columns/column_order and convert to cudf column + size_type column_index = 0; + for (auto const& col_name : col_order) { std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ [column_index](std::vector const& user_dtypes) -> std::optional { @@ -590,17 +625,23 @@ table_with_metadata device_parse_nested_json(device_span d_input, }, [col_name]( std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) + return std::optional{{it->second}}; + return std::nullopt; }, [col_name](std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? 
user_dtypes.find(col_name)->second - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) return it->second; + return std::nullopt; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + if (auto it = user_dtypes.child_types.find(col_name); + it != std::end(user_dtypes.child_types)) + return it->second; + return std::nullopt; }}, options.get_dtypes()); + #ifdef NJP_DEBUG_PRINT auto debug_schema_print = [](auto ret) { std::cout << ", type id: " @@ -608,20 +649,39 @@ table_with_metadata device_parse_nested_json(device_span d_input, << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" << "\n"; }; - std::visit( - cudf::detail::visitor_overload{[column_index](std::vector const&) { - std::cout << "Column by index: #" << column_index; - }, - [col_name](std::map const&) { - std::cout << "Column by flat name: '" << col_name; - }, - [col_name](std::map const&) { - std::cout << "Column by nested name: #" << col_name; - }}, - options.get_dtypes()); + std::visit(cudf::detail::visitor_overload{ + [column_index](std::vector const&) { + std::cout << "Column by index: #" << column_index; + }, + [col_name](std::map const&) { + std::cout << "Column by flat name: '" << col_name; + }, + [col_name](std::map const&) { + std::cout << "Column by nested name: #" << col_name; + }, + [col_name](schema_element const&) { + std::cout << "Column by nested schema with column order: #" << col_name; + }}, + options.get_dtypes()); debug_schema_print(child_schema_element); #endif + auto const found_it = root_struct_col.child_columns.find(col_name); + if (options.is_enabled_prune_columns() and + found_it == std::end(root_struct_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + // inserts all null column + out_column_names.emplace_back(make_column_name_info(child_schema_element.value(), col_name)); + auto all_null_column = + make_all_nulls_column(child_schema_element.value(), root_col_size, stream, mr); + out_columns.emplace_back(std::move(all_null_column)); + column_index++; + continue; + } + auto& json_col = found_it->second; + if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) { // Get this JSON column's cudf column and schema info, (modifies json_col) auto [cudf_col, col_name_info] = diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 34a87918e57..1b61be20202 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character SG: '\n' + DELIM_CHAR, ///< Delimiter character SG OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ -72,13 +72,17 @@ constexpr auto TT_SEC = dfa_states::TT_SEC; constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ - {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; +auto get_sgid_lut(SymbolT delim) +{ + // The i-th string representing all the characters of a symbol group + 
std::array, NUM_SYMBOL_GROUPS - 1> symbol_groups{ + {{'\"'}, {'\''}, {'\\'}, {delim}}}; + return symbol_groups; +} // Transition table std::array, TT_NUM_STATES> const qna_state_tt{{ - /* IN_STATE " ' \ \n OTHER */ + /* IN_STATE " ' \ <delim> OTHER */ /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, @@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes { namespace normalize_whitespace { +// We do not need a symbol group for the delimiter character since whitespace normalization +// now occurs after tokenization. enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; // Alias for readability of symbol group ids constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}}; /** * -------- FST states --------- * ----------------------------- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character + * | quotes as well as any other character not enclosed by a string. + * TT_DQS | Double-quoted string state handling all characters within double quotes * TT_DEC | State handling escaped characters inside double-quoted string. Note that this * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed.
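Aside: both FSTs in this file share one mechanism. Each input character is first classified into a symbol group, and the pair (current state, symbol group) then indexes a transition table to pick the next state. The host-side sketch below walks a DFA shaped like the post-change whitespace-normalization table wna_state_tt in the next hunk; it is illustrative only, not part of this patch, and every name in it is invented for the example.

#include <array>
#include <cstdint>
#include <string>

enum state : uint8_t { OOS, DQS, DEC, NUM_STATES };             // mirrors TT_OOS/TT_DQS/TT_DEC
enum group : uint8_t { QUOTE, ESCAPE, WS, OTHER, NUM_GROUPS };  // symbol groups

// One row per state, one column per symbol group, as in wna_state_tt.
constexpr std::array<std::array<uint8_t, NUM_GROUPS>, NUM_STATES> tt{{
  /* OOS */ {{DQS, OOS, OOS, OOS}},
  /* DQS */ {{OOS, DEC, DQS, DQS}},
  /* DEC */ {{DQS, DQS, DQS, DQS}},
}};

// Classify a character into its symbol group (the job of the symbol-group LUT).
uint8_t classify(char c)
{
  if (c == '"') return QUOTE;
  if (c == '\\') return ESCAPE;
  if (c == ' ' || c == '\t') return WS;
  return OTHER;
}

// Run the DFA over an input string; the GPU FST additionally emits output
// symbols on each transition, which this sketch omits.
uint8_t run_dfa(std::string const& input)
{
  uint8_t s = OOS;  // the DFA's start state, as in the patch
  for (char c : input) { s = tt[s][classify(c)]; }
  return s;
}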
@@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); // Transition table std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + {/* IN_STATE " \ OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; // The DFA's starting state constexpr StateT start_state = static_cast(TT_OOS); @@ -302,18 +304,19 @@ struct TransduceToNormalizedWS { namespace detail { void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; - auto parser = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs), - fst::detail::make_transition_table(normalize_quotes::qna_state_tt), - fst::detail::make_translation_functor( - normalize_quotes::TransduceToNormalizedQuotes{}), - stream); + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)), + fst::detail::make_transition_table(normalize_quotes::qna_state_tt), + fst::detail::make_translation_functor( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); cudf::detail::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 7b3b04dea16..4989fff4b30 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -429,6 +429,29 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Create all null column of a given nested schema + * + * @param schema The schema of the column to create + * @param num_rows The number of rows in the column + * @param stream The CUDA stream to which kernels are dispatched + * @param mr resource with which to allocate + * @return The all null column + */ +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * @brief Create metadata for a column of a given schema + * + * @param schema The schema of the column + * @param col_name The name of the column + * @return column metadata for a given schema + */ +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); + /** * @brief Get the path data type of a column by path if present in input schema * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 60e78f4763d..f1c2826c62a 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2198,9 +2198,9 @@ std::pair, std::vector> json_column_to // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), 
std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema else { diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 4caa5cd9e24..401a6e992de 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -16,14 +16,201 @@ #include "nested_json.hpp" +#include +#include +#include #include +#include +#include +#include +#include +#include #include #include #include +namespace cudf::io { +namespace { +bool validate_column_order(schema_element const& types) +{ + // For struct types, check that column_order size matches child_types size and that every + // element of column_order is present in child_types; recurse into each struct or list child. + // For list types, check that child_types has exactly one entry and recurse into it. + if (types.type.id() == type_id::STRUCT) { + if (types.column_order.has_value()) { + if (types.column_order.value().size() != types.child_types.size()) { return false; } + for (auto const& column_name : types.column_order.value()) { + auto it = types.child_types.find(column_name); + if (it == types.child_types.end()) { return false; } + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + } + } else if (types.type.id() == type_id::LIST) { + if (types.child_types.size() != 1) { return false; } + auto it = types.child_types.begin(); + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + return true; +} +} // namespace + +void json_reader_options::set_dtypes(schema_element types) +{ + CUDF_EXPECTS( + validate_column_order(types), "Column order does not match child types", std::invalid_argument); + _dtypes = std::move(types); +} +} // namespace cudf::io + namespace cudf::io::json::detail { +/// Creates an empty column of the specified schema +struct empty_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + template ())> + std::unique_ptr operator()(schema_element const& schema) const + { + return make_empty_column(schema.type); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); + auto offsets = make_empty_column(data_type(type_to_id())); + return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name))); + } + return make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } +}; + +/// Creates an all-null column of the specified schema +struct allnull_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + private: + auto make_zeroed_offsets(size_type size) const + { + auto offsets_buff = + cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + return
std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); + } + + public: + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); + } + + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + auto indices = make_zeroed_offsets(size - 1); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_dictionary_column( + std::move(child), std::move(indices), std::move(null_mask), size, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_strings_column( + size, std::move(offsets), rmm::device_buffer{}, size, std::move(null_mask)); + } + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_lists_column( + size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size)); + } + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_structs_column( + size, std::move(child_columns), size, std::move(null_mask), stream, mr); + } +}; + +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return cudf::type_dispatcher(schema.type, allnull_column_functor{stream, mr}, schema, num_rows); +} + +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name) +{ + column_name_info info; + info.name = col_name; + switch (schema.type.id()) { + case type_id::STRUCT: + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + info.children.push_back( + make_column_name_info(schema.child_types.at(child_name), child_name)); + } + break; + case type_id::LIST: + info.children.emplace_back("offsets"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::DICTIONARY32: +
info.children.emplace_back("indices"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::STRING: info.children.emplace_back("offsets"); break; + default: break; + } + return info; +} + std::optional child_schema_element(std::string const& col_name, cudf::io::json_reader_options const& options) { @@ -46,6 +233,11 @@ std::optional child_schema_element(std::string const& col_name, return (user_dtypes.find(col_name) != std::end(user_dtypes)) ? user_dtypes.find(col_name)->second : std::optional{}; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) + ? user_dtypes.child_types.find(col_name)->second + : std::optional{}; }}, options.get_dtypes()); } diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 2bc15ea19cb..279f5e71351 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span> sources, // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); + normalize_single_quotes( + bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref()); } auto buffer = diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 45380e6ea20..aaf5ebfbe7d 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -147,6 +147,8 @@ __device__ void gpuDecodeFixedWidthValues( } break; } + } else if (dtype == BOOLEAN) { + gpuOutputBoolean(sb, src_pos, static_cast(dst)); } else if (dtype == INT96) { gpuOutputInt96Timestamp(s, sb, src_pos, static_cast(dst)); } else if (dtype_len == 8) { @@ -841,6 +843,33 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } +template +inline __device__ void bool_plain_decode(page_state_s* s, state_buf* sb, int t, int to_decode) +{ + int pos = s->dict_pos; + int const target_pos = pos + to_decode; + + while (pos < target_pos) { + int const batch_len = min(target_pos - pos, decode_block_size_t); + + if (t < batch_len) { + int const bit_pos = pos + t; + int const byte_offset = bit_pos >> 3; + int const bit_in_byte_index = bit_pos & 7; + + uint8_t const* const read_from = s->data_start + byte_offset; + bool const read_bit = (*read_from) & (1 << bit_in_byte_index); + + int const write_to_index = rolling_index(bit_pos); + sb->dict_idx[write_to_index] = read_bit; + } + + pos += batch_len; + } + + if (t == 0) { s->dict_pos = pos; } +} + template __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { @@ -872,14 +901,7 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) * @param num_rows Maximum number of rows to read * @param error_code Error code to set if an error is encountered */ -template - typename DecodeValuesFunc> +template CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) gpuDecodePageDataGeneric(PageInfo* pages, device_span chunks, @@ -887,12 +909,33 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) size_t num_rows, kernel_error::pointer error_code) { + constexpr bool has_dict_t = (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT) || + 
(kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_LIST); + constexpr bool has_bools_t = (kernel_mask_t == decode_kernel_mask::BOOLEAN) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST); + constexpr bool has_nesting_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); + constexpr bool has_lists_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr bool split_decode_t = + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); __shared__ __align__(16) page_state_s state_g; using state_buf_t = page_state_buffers_s; __shared__ __align__(16) state_buf_t state_buffers; @@ -920,32 +963,31 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - DecodeValuesFunc decode_values; + using value_decoder_type = std::conditional_t< + split_decode_t, + decode_fixed_width_split_values_func, + decode_fixed_width_values_func>; + value_decoder_type decode_values; bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - constexpr int shared_rep_size = - has_lists_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) - : 0; - constexpr int shared_dict_size = - has_dict_t - ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) - : 0; - constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); - constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; + constexpr int rle_run_buffer_bytes = + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + constexpr int shared_buf_size = + rle_run_buffer_bytes * (static_cast(has_dict_t) + static_cast(has_bools_t) + + static_cast(has_lists_t) + 1); __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers - int shared_offset = 0; - rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t) { shared_offset += shared_rep_size; } - - rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_dict_t) { shared_offset += shared_dict_size; } - rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); + int shared_offset = 0; + auto rep_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_lists_t) { shared_offset += rle_run_buffer_bytes; } + auto dict_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_dict_t) { shared_offset += rle_run_buffer_bytes; } + auto bool_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_bools_t) { shared_offset += rle_run_buffer_bytes; } + auto def_runs = reinterpret_cast(shared_buf + shared_offset); // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; @@ -974,6 +1016,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); } + // Use dictionary stream memory for bools + rle_stream bool_stream{bool_runs}; + bool bools_are_rle_stream = (s->dict_run == 0); + if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.init(1, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + } + } + __syncthreads(); + // We use two counters in the loop below: processed_count and valid_count. // - processed_count: number of values out of num_input_values that we have decoded so far. // the definition stream returns the number of total rows it has processed in each call @@ -1041,13 +1093,20 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } __syncthreads(); - // if we have dictionary data + // if we have dictionary or bool data + // We want to limit the number of dictionary/bool items we decode, that correspond to + // the rows we have processed in this iteration that are valid. + // We know the number of valid rows to process with: next_valid_count - valid_count. if constexpr (has_dict_t) { - // We want to limit the number of dictionary items we decode, that correspond to - // the rows we have processed in this iteration that are valid. - // We know the number of valid rows to process with: next_valid_count - valid_count. 
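Aside, for readers tracing bool_plain_decode introduced earlier in this file: PLAIN-encoded Parquet booleans are bit-packed, least-significant bit first within each byte, which is why the helper splits a bit position into byte_offset = pos >> 3 and bit_in_byte_index = pos & 7. A minimal host-side sketch of the same arithmetic follows; it is not part of the patch and read_packed_bool is an invented name.

#include <cstdint>

// Read bit `pos` from an LSB-first bit-packed buffer, mirroring the
// byte_offset / bit_in_byte_index arithmetic in bool_plain_decode.
bool read_packed_bool(uint8_t const* data, int pos)
{
  int const byte_offset       = pos >> 3;  // pos / 8
  int const bit_in_byte_index = pos & 7;   // pos % 8
  return (data[byte_offset] >> bit_in_byte_index) & 1;
}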
dict_stream.decode_next(t, next_valid_count - valid_count); __syncthreads(); + } else if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.decode_next(t, next_valid_count - valid_count); + } else { + bool_plain_decode(s, sb, t, next_valid_count - valid_count); + } + __syncthreads(); } // decode the values themselves @@ -1061,250 +1120,82 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } // anonymous namespace -void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using kernel_tag_t = std::integral_constant; -void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using int_tag_t = std::integral_constant; -void __host__ -DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) +void __host__ DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream) { - constexpr int 
decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric + // No template parameters on lambdas until C++20, so use type tags instead + auto launch_kernel = [&](auto block_size_tag, auto kernel_mask_tag) { + constexpr int decode_block_size = decltype(block_size_tag)::value; + constexpr decode_kernel_mask mask = decltype(kernel_mask_tag)::value; + + dim3 dim_block(decode_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodePageDataGeneric + gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } + }; + + switch (kernel_mask) { + case decode_kernel_mask::FIXED_WIDTH_NO_DICT: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT_NESTED: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN_NESTED: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + default: CUDF_EXPECTS(false, "Kernel type not handled by this function"); break; } } diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 62f1ee88036..5b9831668e6 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -343,8 +343,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - __shared__ rle_run rep_runs[rle_run_buffer_size]; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, 
{rep_runs}}; diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 52d53cb8225..a8a8c441a84 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,9 +181,13 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; + } else if (is_boolean(chunk)) { + return is_list(chunk) ? decode_kernel_mask::BOOLEAN_LIST + : is_nested(chunk) ? decode_kernel_mask::BOOLEAN_NESTED + : decode_kernel_mask::BOOLEAN; } - if (!is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_byte_array(chunk)) { if (page.encoding == Encoding::PLAIN) { return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index ca74a1c2ba0..5ece3a54892 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -618,8 +618,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBo constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - __shared__ rle_run rep_runs[rle_run_buffer_size]; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index dba24b553e6..3b4d0e6dc80 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -224,6 +224,9 @@ enum class decode_kernel_mask { FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists + BOOLEAN = (1 << 15), // Run decode kernel for boolean data + BOOLEAN_NESTED = (1 << 16), // Run decode kernel for nested boolean data + BOOLEAN_LIST = (1 << 17), // Run decode kernel for list boolean data }; // mask representing all the ways in which a string can be encoded @@ -539,7 +542,7 @@ enum class encode_kernel_mask { DELTA_BINARY = (1 << 2), // Run DELTA_BINARY_PACKED encoding kernel DELTA_LENGTH_BA = (1 << 3), // Run DELTA_LENGTH_BYTE_ARRAY encoding kernel DELTA_BYTE_ARRAY = (1 << 4), // Run DELTA_BYTE_ARRAY encoding kernel - BYTE_STREAM_SPLIT = (1 << 5), // Run plain encoding kernel, but split streams + BYTE_STREAM_SPLIT = (1 << 5) // Run plain encoding kernel, but split streams }; /** @@ -911,72 +914,18 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. + * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch.
* @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading dictionary fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. - * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. 
- * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); +void DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder row group fragments diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 689386b8957..d74ae83b635 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -97,22 +97,37 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } + // Compute column string sizes (using page string offsets) for this subpass col_string_sizes = calculate_page_string_offsets(); - // check for overflow + // ensure cumulative column string sizes have been initialized + if (pass.cumulative_col_string_sizes.empty()) { + pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); + } + + // Add to the cumulative column string sizes of this pass + std::transform(pass.cumulative_col_string_sizes.begin(), + pass.cumulative_col_string_sizes.end(), + col_string_sizes.begin(), + pass.cumulative_col_string_sizes.begin(), + std::plus<>{}); + + // Check for overflow in cumulative column string sizes of this pass so that the page string + // offsets of overflowing (large) string columns are treated as 64-bit. 
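Aside: the check below operates on the cumulative sizes rather than on this subpass's col_string_sizes alone. A standalone sketch of the same pattern follows, with invented names and under the assumption that the threshold is the 32-bit offset limit; it is illustrative only, not part of the patch.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Accumulate this pass's per-column string sizes into the running totals, then
// flag whether any column's *cumulative* size crosses the large-strings
// threshold. A per-pass check alone would miss a column that grows slowly
// across several passes but overflows in aggregate.
bool needs_64bit_offsets(std::vector<std::size_t>& cumulative,
                         std::vector<std::size_t> const& this_pass,
                         std::size_t threshold)
{
  std::transform(cumulative.begin(), cumulative.end(),
                 this_pass.begin(), cumulative.begin(), std::plus<>{});
  return std::any_of(cumulative.cbegin(), cumulative.cend(),
                     [threshold](std::size_t sz) { return sz > threshold; });
}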
auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), - col_string_sizes.cend(), + auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), + pass.cumulative_col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // mark any chunks that are large string columns + // Mark any chunks for which the cumulative column string size has exceeded the + // large strings threshold if (has_large_strings) { for (auto& chunk : pass.chunks) { auto const idx = chunk.src_col_index; - if (col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } + if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } } } } @@ -195,7 +210,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], _stream); + out_buf.create_string_data( + col_string_sizes[pass.chunks[c].src_col_index], + pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > + static_cast(strings::detail::get_offset64_threshold()), + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -219,8 +238,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - // launch string decoder int s_idx = 0; + + auto decode_data = [&](decode_kernel_mask decoder_mask) { + DecodePageData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + decoder_mask, + error_code.data(), + streams[s_idx++]); + }; + + // launch string decoder if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { DecodeStringPageData(subpass.pages, pass.chunks, @@ -266,41 +297,17 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // launch byte stream split decoder if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT); } // launch byte stream split decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); } // launch byte stream split decoder, for list columns if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); } // launch byte stream split decoder @@ -316,80 +323,47 @@ void reader::impl::decode_page_data(read_mode mode, 
size_t skip_rows, size_t num // launch fixed width type decoder if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT); } // launch fixed width type decoder for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST); } // launch fixed width type decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED); + } + + // launch boolean type decoder + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN) != 0) { + decode_data(decode_kernel_mask::BOOLEAN); + } + + // launch boolean type decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_NESTED) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_NESTED); + } + + // launch boolean type decoder, for list columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_LIST) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_LIST); } // launch fixed width type decoder with dictionaries if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT); } // launch fixed width type decoder with dictionaries for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_LIST); } // launch fixed width type decoder with dictionaries, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_NESTED); } // launch the catch-all page decoder diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index a0c2dbd3e44..ca46f198bb8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -130,6 +130,9 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; + // cumulative strings column sizes. + std::vector cumulative_col_string_sizes{}; + int level_type_size{0}; // skip_rows / num_rows for this pass. diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 69e783a89d0..3c49de0c997 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -152,7 +152,6 @@ __device__ inline void decode(level_t* const output, } // a single rle run.
may be broken up into multiple rle_batches -template struct rle_run { int size; // total size of the run int output_pos; // absolute position of this run w.r.t output @@ -183,14 +182,14 @@ struct rle_stream { level_t* output; - rle_run* runs; + rle_run* runs; int output_pos; int fill_index; int decode_index; - __device__ rle_stream(rle_run* _runs) : runs(_runs) {} + __device__ rle_stream(rle_run* _runs) : runs(_runs) {} __device__ inline bool is_last_decode_warp(int warp_id) { @@ -217,7 +216,7 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. } - __device__ inline int get_rle_run_info(rle_run& run) + __device__ inline int get_rle_run_info(rle_run& run) { run.start = cur; run.level_run = get_vlq32(run.start, end); @@ -384,7 +383,7 @@ struct rle_stream { // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time while (cur < end) { - rle_run run; + rle_run run; int run_bytes = get_rle_run_info(run); if ((output_pos + run.size) > target_count) { diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 6d954753af8..41ed55cd090 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -63,9 +63,11 @@ void cudf::io::detail::inline_column_buffer::allocate_strings_data(bool memset_d } void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + bool is_large_strings_col, rmm::cuda_stream_view stream) { - _string_data = rmm::device_buffer(num_bytes, stream, _mr); + _is_large_strings_col = is_large_strings_col; + _string_data = rmm::device_buffer(num_bytes, stream, _mr); } namespace { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 31c8b781e77..da19539f509 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -246,13 +246,17 @@ class inline_column_buffer : public column_buffer_base { [[nodiscard]] size_t data_size_impl() const { return _data.size(); } std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void create_string_data(size_t num_bytes, + bool is_large_strings_col, + rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } [[nodiscard]] void const* string_data() const { return _string_data.data(); } [[nodiscard]] size_t string_size() const { return _string_data.size(); } + [[nodiscard]] bool is_large_strings_column() const { return _is_large_strings_col; } private: rmm::device_buffer _string_data{}; + bool _is_large_strings_col{}; }; using column_buffer = gather_column_buffer; diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu index 4bc303a34a5..66d0a644c12 100644 --- a/cpp/src/io/utilities/column_buffer_strings.cu +++ b/cpp/src/io/utilities/column_buffer_strings.cu @@ -27,8 +27,7 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu { // if the size of _string_data is over the threshold for 64bit size_type, _data will contain // sizes rather than offsets. need special handling for that case. 
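+ // Note: with chunked reads, each individual _string_data buffer can stay under the threshold + // even when the column's cumulative size across passes exceeds it, so the reader now records + // that decision up front (via create_string_data) instead of re-deriving it from the buffer + // size here.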
- auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - if (_string_data.size() > threshold) { + if (is_large_strings_column()) { if (not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 15de5d85614..68377ad6d5f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -72,8 +72,12 @@ class file_sink : public data_sink { [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - if (size < _gds_write_preferred_threshold) { return false; } - return supports_device_write(); + if (!supports_device_write()) { return false; } + + // Always prefer device writes if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_write_preferred_threshold; } std::future device_write_async(void const* gpu_data, diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 15a4a270ce0..0870e4a84a7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -33,8 +33,13 @@ #include #include +#include #include +#ifdef CUDF_KVIKIO_REMOTE_IO +#include +#endif + namespace cudf { namespace io { namespace { @@ -90,8 +95,12 @@ class file_source : public datasource { [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - if (size < _gds_read_preferred_threshold) { return false; } - return supports_device_read(); + if (!supports_device_read()) { return false; } + + // Always prefer device reads if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_read_preferred_threshold; } std::future device_read_async(size_t offset, @@ -389,6 +398,96 @@ class user_datasource_wrapper : public datasource { datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; +#ifdef CUDF_KVIKIO_REMOTE_IO +/** + * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. 
+ */ +class remote_file_source : public datasource { + static std::unique_ptr create_s3_endpoint(char const* filepath) + { + auto [bucket_name, bucket_object] = kvikio::S3Endpoint::parse_s3_url(filepath); + return std::make_unique(bucket_name, bucket_object); + } + + public: + explicit remote_file_source(char const* filepath) : _kvikio_file{create_s3_endpoint(filepath)} {} + + ~remote_file_source() override = default; + + [[nodiscard]] bool supports_device_read() const override { return true; } + + [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return true; } + + [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } + + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return device_read_async(offset, size, dst, stream).get(); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset).get(); + } + + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto const count = std::min(size, this->size() - offset); + std::vector h_data(count); + this->host_read(offset, count, h_data.data()); + return datasource::buffer::create(std::move(h_data)); + } + + /** + * @brief Is `url` referring to a remote file supported by KvikIO? + * + * For now, only S3 URLs (URLs starting with "s3://") are supported. + */ + static bool is_supported_remote_url(std::string const& url) + { + // Regular expression to match "s3://" + static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + return std::regex_search(url, pattern); + } + + private: + kvikio::RemoteHandle _kvikio_file; +}; +#else +/** + * @brief When KvikIO remote IO is disabled, `is_supported_remote_url()` always returns false.
+ */ +class remote_file_source : public file_source { + public: + explicit remote_file_source(char const* filepath) : file_source(filepath) {} + static constexpr bool is_supported_remote_url(std::string const&) { return false; } +}; +#endif } // namespace std::unique_ptr datasource::create(std::string const& filepath, @@ -403,8 +502,9 @@ std::unique_ptr datasource::create(std::string const& filepath, CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); }(); - - if (use_memory_mapping) { + if (remote_file_source::is_supported_remote_url(filepath)) { + return std::make_unique(filepath.c_str()); + } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); } else { // `file_source` reads the file directly, without memory mapping diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 80fd72a3088..21f6fe87a62 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -195,10 +195,11 @@ std::unique_ptr quantile(column_view const& input, interpolation interp, column_view const& ordered_indices, bool exact, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr); + return detail::quantile(input, q, interp, ordered_indices, exact, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index 69421f3bfc4..a94fb9362b9 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -103,17 +103,12 @@ std::unique_ptr
quantiles(table_view const& input, cudf::sorted is_input_sorted, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantiles(input, - q, - interp, - is_input_sorted, - column_order, - null_precedence, - cudf::get_default_stream(), - mr); + return detail::quantiles( + input, q, interp, is_input_sorted, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 43c3b0a291b..fb5aebb4b39 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -410,10 +410,11 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); + return tdigest::percentile_approx(input, percentiles, stream, mr); } } // namespace cudf diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu new file mode 100644 index 00000000000..1183e3e4038 --- /dev/null +++ b/cpp/src/strings/search/contains_multiple.cu @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +/** + * @brief Kernel for finding multiple targets in each row of input strings + * + * The d_first_bytes is sorted and unique so the d_indices and d_offsets + * are used to map the corresponding character to its d_targets entry. + * + * Example + * d_targets = ["foo", "hello", "world", "hi"] + * - sorted first-chars: ['f','h','h','w'] + * d_indices = [0, 3, 1, 2] + * d_first_bytes = ['f', 'h', 'w'] (unique) + * d_offsets = [0, 1, 3] + * unique_count = 3 + * + * If 'h' is found, lower_bound produces pos=1 in d_first_bytes. + * This corresponds to d_offset[1]==1 which has two values: + * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2. 
+ * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential
+ * in the d_indices array:
+ * - tgt1_idx = d_indices[map_idx] = 3   --> d_targets[3] == 'hi'
+ * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello'
+ * The logic now only needs to check for either of these 2 targets.
+ *
+ * This kernel works in either thread-per-string or warp-per-string mode,
+ * depending on the template parameter. If tile_size==1, the kernel executes
+ * with one thread per string; if tile_size==32, it executes with one warp per
+ * string. No other options are supported for now.
+ *
+ * @tparam tile_size Number of threads per string
+ * @param d_strings Input strings
+ * @param d_targets Target strings to search within input strings
+ * @param d_first_bytes Sorted, unique list of first bytes of the target strings
+ * @param d_indices Indices to map sorted d_first_bytes to d_targets
+ * @param d_offsets Offsets to map d_indices to d_targets
+ * @param unique_count Number of unique values in d_first_bytes (and d_offsets)
+ * @param working_memory Global memory to use if shared-memory is too small
+ * @param d_results Bool results for each target within each string row
+ */
+template <int32_t tile_size>
+CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
+                                       column_device_view const d_targets,
+                                       u_char const* d_first_bytes,
+                                       size_type const* d_indices,
+                                       size_type const* d_offsets,
+                                       size_type unique_count,
+                                       bool* working_memory,
+                                       cudf::device_span<bool*> d_results)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+
+  // get the string for this tile
+  auto const d_str = d_strings.element<string_view>(str_idx);
+
+  namespace cg = cooperative_groups;
+  auto const tile        = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto const lane_idx    = tile.thread_rank();
+  auto const num_targets = d_targets.size();
+
+  // size of shared_bools = num_targets * block_size
+  // each thread uses num_targets bools
+  extern __shared__ bool shared_bools[];
+  // bools for the current string
+  auto bools = working_memory == nullptr
+                 ? (shared_bools + (tile.meta_group_rank() * tile_size * num_targets))
+                 : (working_memory + (str_idx * tile_size * num_targets));
+
+  // initialize result: set true if target is empty, false otherwise
+  for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
+    auto const d_target = d_targets.element<string_view>(target_idx);
+    if constexpr (tile_size == 1) {
+      d_results[target_idx][str_idx] = d_target.empty();
+    } else {
+      auto const begin = bools + (target_idx * tile_size);
+      thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty());
+    }
+  }
+  tile.sync();
+
+  auto const last_ptr = d_first_bytes + unique_count;
+  for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes();
+       str_byte_idx += tile_size) {
+    // search for byte in first_bytes array
+    auto const sptr     = d_str.data() + str_byte_idx;
+    auto const chr      = static_cast<u_char>(*sptr);
+    auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr);
+    // if not found, continue to next byte
+    if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
+    // compute index of matched byte
+    auto const offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx          = d_offsets[offset_idx];
+    auto const last_idx   = (offset_idx + 1) < unique_count ?
d_offsets[offset_idx + 1] : num_targets; + // check for targets that begin with chr + while (map_idx < last_idx) { + auto const target_idx = d_indices[map_idx++]; + auto const bool_idx = (target_idx * tile_size) + lane_idx; + auto const found = tile_size == 1 ? d_results[target_idx][str_idx] : bools[bool_idx]; + if (!found) { // not found before + auto const d_target = d_targets.element(target_idx); + if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { + // first char already checked, so just check the [1, end) chars match + auto const tp = d_target.data(); + if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) { + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = true; + } else { + bools[bool_idx] = true; + } + } + } + } + } + } + + if constexpr (tile_size > 1) { + tile.sync(); + // reduce the bools for each target to store in the result + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const begin = bools + (target_idx * tile_size); + d_results[target_idx][str_idx] = + thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + // cooperative_group any() implementation was almost 3x slower than this parallel reduce + } + } +} +} // namespace + +std::unique_ptr
contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS( + not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument); + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_targets = column_device_view::create(targets.parent(), stream); + + // copy the first byte of each target and sort them + auto first_bytes = rmm::device_uvector(targets.size(), stream); + auto indices = rmm::device_uvector(targets.size(), stream); + { + auto tgt_itr = thrust::make_transform_iterator( + d_targets->begin(), + cuda::proclaim_return_type([] __device__(auto const& d_tgt) -> u_char { + return d_tgt.empty() ? u_char{0} : static_cast(d_tgt.data()[0]); + })); + auto count_itr = thrust::make_counting_iterator(0); + auto keys_out = first_bytes.begin(); + auto vals_out = indices.begin(); + auto num_items = targets.size(); + auto cmp_op = thrust::less(); + auto sv = stream.value(); + + std::size_t tmp_bytes = 0; + cub::DeviceMergeSort::SortPairsCopy( + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + cub::DeviceMergeSort::SortPairsCopy( + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + } + + // remove duplicates to help speed up lower_bound + auto offsets = rmm::device_uvector(targets.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); + auto const end = thrust::unique_by_key( + rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); + auto const unique_count = + static_cast(thrust::distance(first_bytes.begin(), end.first)); + + // create output columns + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + }); + auto results = std::vector>(results_iter, results_iter + targets.size()); + auto d_results = [&] { + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + }(); + + constexpr cudf::thread_index_type block_size = 256; + // calculated (benchmarked) for efficient use of shared-memory + constexpr size_type targets_threshold = 32; + + auto d_first_bytes = first_bytes.data(); + auto d_indices = indices.data(); + auto d_offsets = offsets.data(); + + bool const row_parallel = ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)); + + if (row_parallel) { + // Smaller strings perform better with a row per string + cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; + multi_contains_kernel<1> + <<>>(*d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + nullptr, + d_results); + } else { + constexpr cudf::thread_index_type tile_size = 
cudf::detail::warp_size; + + auto const shared_mem_size = + (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0; + auto const work_mem_size = + (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size(); + auto working_memory = rmm::device_uvector(work_mem_size, stream); + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + multi_contains_kernel + <<>>( + *d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); + } + + return std::make_unique
(std::move(results)); +} + +} // namespace detail + +std::unique_ptr
contains_multiple(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_multiple(strings, targets, stream, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index ec7015878dd..67226b259d4 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -42,8 +42,9 @@ std::unique_ptr find_multiple(strings_column_view const& input, { auto const strings_count = input.size(); auto const targets_count = targets.size(); - CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); - CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); + CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument); + CUDF_EXPECTS( + !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument); auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index a03a34f5fa7..aee83ab35ed 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,9 +38,13 @@ #include #include +#include #include +#include #include #include +#include +#include #include @@ -162,6 +167,339 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +constexpr cudf::thread_index_type block_size = 256; +// for potentially tuning minhash_seed_kernel independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; + +// Number of a/b parameter values to process per thread. +// The intermediate values are stored in shared-memory and therefore limits this count. +// This value was found to be an efficient size for both uint32 and uint64 +// hash types based on benchmarks. +constexpr cuda::std::size_t params_per_thread = 16; + +// Separate kernels are used to process strings above and below this value (in bytes). +constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +// The number of blocks per string for the above-threshold kernel processing. +constexpr cudf::size_type blocks_per_string = 64; +// The above values were determined using the redpajama and books_sample datasets + +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * + * This kernel computes the hashes for each string using the seed and the specified + * hash function. The width is used to compute rolling substrings to hash over. + * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel. + * + * This kernel also counts the number of strings above the wide_string_threshold + * and proactively initializes the output values for those strings. 
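+ *
+ * For example (an illustrative sketch): with width=5 the string "minhash"
+ * produces the substrings "minha", "inhas", and "nhash", one per starting
+ * character, and each is hashed with the seed. Positions that begin on a
+ * UTF-8 continuation byte, or that cannot fit a full width (other than the
+ * first position), store 0, which later processing treats as a skip sentinel.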
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_strings The input strings to hash + * @param seed The seed used for the hash function + * @param width Width in characters used for determining substrings to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of strings above wide_string_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, + cudf::size_type width, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // retrieve this string's offset to locate the output position in d_hashes + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + if (size_bytes == 0) { return; } + + auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); + auto const lane_idx = tid % tile_size; + + // hashes for this string/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + auto const hasher = HashFunction(seed); + + for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { + *seed_hashes = 0; + continue; + } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { + // true itr+width is past the end of the string + *seed_hashes = 0; + continue; + } + + auto const hash_str = cudf::string_view(itr, bytes); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = thrust::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here so an extra kernel is not required + if (size_bytes >= wide_string_threshold) { + if (lane_idx == 0) { + // count the number of wide strings + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider strings + auto d_output = d_results + (str_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = std::numeric_limits::max(); + } + } +} + +/** + * @brief Permutation calculation kernel + * + * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and + * parameter_b values to compute the final output results. + * The output is the number of input rows (N) by the number of parameter values (M). 
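+ *
+ * For intuition, a sketch of the datatrove-style permutation used in the body:
+ * each (parameter_a[i], parameter_b[i]) pair defines one hash permutation, and
+ * for a substring hash hv the permuted value is
+ *   ((hv * parameter_a[i] + parameter_b[i]) % ((1UL << 61) - 1)) & hash_max
+ * The result for a row and parameter i is the minimum of this value over all
+ * of the row's substring hashes.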
+ * Each output row contains one result per (parameter_a, parameter_b) pair.
+ *
+ * This kernel is launched with one block per string for strings below
+ * wide_string_threshold, and with blocks_per_string blocks per string for
+ * strings at or above wide_string_threshold.
+ *
+ * @tparam hash_value_type Derived from HashFunction result_type
+ * @tparam blocks_per_string Number of blocks used to process each string
+ *
+ * @param d_strings The input strings to hash
+ * @param indices The indices of the strings in d_strings to process
+ * @param parameter_a 1st set of parameters for the calculation result
+ * @param parameter_b 2nd set of parameters for the calculation result
+ * @param width Used for calculating the number of available hashes in each string
+ * @param d_hashes The hash values computed in minhash_seed_kernel
+ * @param d_results Final results vector of calculated values
+ */
+template <typename hash_value_type, int blocks_per_string>
+CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<cudf::size_type const> indices,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         hash_value_type const* d_hashes,
+                                         hash_value_type* d_results)
+{
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const idx = (tid / blocks_per_string) / block_size;
+  if (idx >= indices.size()) { return; }
+  auto const str_idx = indices[idx];
+  if (d_strings.is_null(str_idx)) { return; }
+
+  auto const block      = cooperative_groups::this_thread_block();
+  int const section_idx = block.group_index().x % blocks_per_string;
+
+  auto const offsets     = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+
+  // number of items to process in this block;
+  // the last block also includes any remainder values from the
+  // size_bytes/blocks_per_string truncation
+  // example:
+  //  each section_size for a string with size 588090 and blocks_per_string=64 is 9188,
+  //  except the last section which is 9188 + (588090 % 64) = 9246
+  auto const section_size =
+    (size_bytes / blocks_per_string) +
+    (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
+  auto const section_offset = section_idx * (size_bytes / blocks_per_string);
+
+  // hash values for this block/section
+  auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset;
+  // width used here as a max value since a string's char-count <= byte-count
+  auto const hashes_size =
+    section_idx < (blocks_per_string - 1)
+      ? section_size
+      : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
+
+  auto const init = size_bytes == 0 ?
0 : std::numeric_limits::max(); + auto const lane_idx = block.thread_rank(); + auto const d_output = d_results + (str_idx * parameter_a.size()); + + auto const begin = seed_hashes + lane_idx; + auto const end = seed_hashes + hashes_size; + + // constants used in the permutation calculations + constexpr uint64_t mersenne_prime = (1UL << 61) - 1; + constexpr hash_value_type hash_max = std::numeric_limits::max(); + + // found to be an efficient shared memory size for both hash types + __shared__ hash_value_type block_values[block_size * params_per_thread]; + + for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { + // initialize this block's chunk of shared memory + // each thread handles params_per_thread of values + auto const chunk_values = block_values + (lane_idx * params_per_thread); + thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init); + block.sync(); + + auto const param_count = + cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); + + // each lane accumulates min hashes in its shared memory + for (auto itr = begin; itr < end; itr += block_size) { + auto const hv = *itr; + // 0 is used as a skip sentinel for UTF-8 and trailing bytes + if (hv == 0) { continue; } + + for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { + // permutation formula used by datatrove + hash_value_type const v = + ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; + block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); + } + } + block.sync(); + + // reduce each parameter values vector to a single min value; + // assumes that the block_size > params_per_thread; + // each thread reduces a block_size of parameter values (thread per parameter) + if (lane_idx < param_count) { + auto const values = block_values + (lane_idx * block_size); + // cooperative groups does not have a min function and cub::BlockReduce was slower + auto const minv = + thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + if constexpr (blocks_per_string > 1) { + // accumulates mins for each block into d_output + cuda::atomic_ref ref{d_output[lane_idx + i]}; + ref.fetch_min(minv, cuda::std::memory_order_relaxed); + } else { + d_output[lane_idx + i] = minv; + } + } + block.sync(); + } +} + +template +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto results = + 
cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.chars_size(stream); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + minhash_seed_kernel + <<>>(*d_strings, + seed, + width, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + auto const threshold_count = d_threshold_count.value(stream); + + auto indices = rmm::device_uvector(input.size(), stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < input.size() ? input.size() : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < input.size())) { + auto sizes = rmm::device_uvector(input.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + sizes.data(), + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + })); + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + return results; +} + /** * @brief Compute the minhash of each list row of strings for each seed * @@ -309,6 +647,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar const& seed, cudf::size_type width, @@ -333,6 +685,20 @@ std::unique_ptr 
minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash64(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, @@ -374,6 +740,18 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seeds, width, stream, mr); } +std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar seed, cudf::size_type width, @@ -394,6 +772,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e9ba58ba224..cbca0ceef77 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -83,7 +83,6 @@ function(ConfigureTest CMAKE_TEST_NAME) "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" ) endif() - enable_clang_tidy(${CMAKE_TEST_NAME}) endfunction() # ################################################################################################## @@ -611,6 +610,7 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp + text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp @@ -712,6 +712,7 @@ ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) +ConfigureTest(STREAM_QUANTILE_TEST streams/quantile_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index c8c2d18903f..0fbd7da7f4d 100644 --- 
a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -34,7 +34,9 @@ // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) +void run_test(std::string const& host_input, + std::string const& expected_host_output, + char delimiter = '\n') { // RMM memory resource std::shared_ptr rsc = @@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Preprocessing FST cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); + cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), @@ -172,6 +174,13 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces run_test(input, output); } +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter) +{ + std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"}; + std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"}; + run_test(input, output, 'z'); +} + TEST_F(JsonNormalizationTest, ReadJsonOption) { // RMM memory resource @@ -179,22 +188,24 @@ TEST_F(JsonNormalizationTest, ReadJsonOption) std::make_shared(); // Test input - std::string const host_input = R"({"A":'TEST"'})"; + std::string const host_input = R"({"a": "1\n2"}h{'a': 12})"; cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) .lines(true) + .delimiter('h') .normalize_single_quotes(true); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; + std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})"; cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) - .lines(true); + .lines(true) + .delimiter('h'); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index b58ca56e066..26937c9298a 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -239,7 +239,7 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest(), scale}}) + .dtypes(std::vector{data_type{type_to_id(), scale}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -324,7 +324,7 @@ TEST_P(JsonReaderParamTest, FloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -348,7 +348,8 @@ TEST_P(JsonReaderParamTest, JsonLinesStrings) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) - .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) + .dtypes(std::map{ + {"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); 
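  // A hedged aside on the change above: these tests now spell out the container
  // because builder::dtypes() accepts several forms (a positional std::vector,
  // a keyed std::map, and schema-based overloads), so a bare braced-init-list
  // is presumably ambiguous among them; e.g.
  //   .dtypes(std::vector<cudf::data_type>{dtype<double>()})
  // selects the positional-types overload explicitly.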
cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -466,7 +467,7 @@ TEST_P(JsonReaderParamTest, Booleans) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -508,7 +509,7 @@ TEST_P(JsonReaderParamTest, Dates) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .dtypes(std::vector{data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -564,7 +565,7 @@ TEST_P(JsonReaderParamTest, Durations) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) + .dtypes(std::vector{data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1022,7 +1023,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1461,7 +1462,7 @@ TEST_F(JsonReaderTest, ErrorStrings) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{cudf::type_id::STRING}}) + .dtypes(std::vector{data_type{cudf::type_id::STRING}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -1849,7 +1850,7 @@ TYPED_TEST(JsonFixedPointReaderTest, EmptyValues) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{type_to_id(), 0}}) + .dtypes(std::vector{data_type{type_to_id(), 0}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -2827,7 +2828,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2865,7 +2866,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2884,9 +2885,9 @@ TEST_F(JsonReaderTest, MixedTypesWithSchema) std::map data_types; std::map child_types; child_types.insert( - std::pair{"element", 
cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}}); - data_types.insert(std::pair{ - "data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}}); + std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING}, {}}}); + data_types.insert( + std::pair{"data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST}, child_types}}); cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) @@ -2991,4 +2992,264 @@ TEST_F(JsonReaderTest, LastRecordInvalid) CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}}); } +// Test case for dtype pruning with column order +TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .lines(lines); + + // include all columns + //// schema with partial ordering + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"0", "1"}}}}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"b", "a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + EXPECT_EQ(result.metadata.schema_info[1].name, "a"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // "b" children checks + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[1].name, "element"); + // types + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(1).type().id(), cudf::type_id::FLOAT32); + } + //// schema with pruned columns and different order. 
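+      //// a minimal sketch of the mechanism under test (names illustrative):
+      ////   cudf::io::schema_element s{data_type{cudf::type_id::STRUCT},
+      ////                              {{"x", {dtype<int32_t>()}}, {"y", {dtype<bool>()}}},
+      ////                              {{"y", "x"}}};
+      ////   in_options.set_dtypes(s);  // with prune_columns, output order is y, x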
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"c", {dtype<bool>()}},
+                                              {"b",
+                                               {
+                                                 data_type{cudf::type_id::STRUCT},
+                                               }},
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"c", "b", "a"}}};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // "c", "b" and "a" order
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "c");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "a");
+      // pruned
+      EXPECT_EQ(result.metadata.schema_info[1].children.size(), 0);
+    }
+    //// schema with pruned columns and different sub-order.
+    {
+      cudf::io::schema_element dtype_schema{
+        data_type{cudf::type_id::STRUCT},
+        {
+          {"c", {dtype<bool>()}},
+          {"b",
+           {data_type{cudf::type_id::STRUCT},
+            // {},
+            {{"0", {data_type{cudf::type_id::STRING}}},
+             {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}},
+            {{"1", "0"}}}},
+          {"a", {dtype<int32_t>()}},
+        }};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Order of occurrence in JSON
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+      // Sub-order of "b"
+      EXPECT_EQ(result.metadata.schema_info[1].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "1");
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "0");
+    }
+    //// schema with 1 dtype, but 2 column order
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"a", "b"}}};
+      EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument);
+      // Input schema column order size mismatch with input schema child types
+    }
+    //// repeated name in column order, Error
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"a", "a"}}};
+      EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument);
+      // Input schema column order size mismatch with input schema child types
+    }
+    //// different column name in order, Error
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"b"}}};
+      EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument);
+      // Column name not found in input schema map, but present in column order and
+      // prune_columns is enabled
+    }
+    // include only one column (nested)
+    {
+      cudf::io::schema_element dtype_schema{
+        data_type{cudf::type_id::STRUCT},
+        {
+          {"b",
+           {data_type{cudf::type_id::STRUCT},
+            {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}},
+            {{"1"}}}},
+        }};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "b":"1":[float]
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1");
+      ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element"); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32); + } + // multiple - all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + // multiple - not all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"d", {dtype()}}, + }, + {{"a", "d"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_bools = + cudf::test::fixed_width_column_wrapper{{true, true, true, true}, {0, 0, 0, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_bools); + } + // test struct, list of string, list of struct. + // multiple - not all present nested + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + { + {"2", {data_type{cudf::type_id::STRING}}}, + }, + {{"2"}}}}, + {"d", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"e", + {data_type{cudf::type_id::LIST}, + {{"element", + { + data_type{cudf::type_id::STRUCT}, + { + {"3", {data_type{cudf::type_id::STRING}}}, + }, //{{"3"}} missing column_order, but output should not have it. 
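+              // (hedged reading: because this "element" struct omits a
+              // column_order while prune_columns is on, its "3" child is
+              // dropped from the output schema, as the zero-children
+              // assertion below verifies)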
+           }}}}},
+        },
+        {{"b", "d", "e"}}};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "b" (with only its "2" child), "d", and "e"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+      ASSERT_EQ(result.metadata.schema_info[0].children[0].name, "2");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "d");
+      auto all_null_strings = cudf::test::strings_column_wrapper{{"", "", "", ""}, {0, 0, 0, 0}};
+      EXPECT_EQ(result.tbl->get_column(0).num_children(), 1);
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(0), all_null_strings);
+      auto const all_null_list = cudf::test::lists_column_wrapper{
+        {{0, 0}, {1, 1}, {2, 2}, {3, 3}}, cudf::test::iterators::all_nulls()};
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_list);
+      EXPECT_EQ(result.metadata.schema_info[2].name, "e");
+      ASSERT_EQ(result.metadata.schema_info[2].children.size(), 2);
+      ASSERT_EQ(result.metadata.schema_info[2].children[1].children.size(), 0);
+      // ASSERT_EQ(result.metadata.schema_info[2].children[1].children[0].name, "3");
+      auto empty_string_col = cudf::test::strings_column_wrapper{};
+      cudf::test::structs_column_wrapper expected_structs{{}, cudf::test::iterators::all_nulls()};
+      // make an all-null column of list<struct<string>>
+      auto wrapped = make_lists_column(
+        4,
+        cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 0}.release(),
+        expected_structs.release(),
+        4,
+        cudf::create_null_mask(4, cudf::mask_state::ALL_NULL));
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped);
+    }
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
index f47782a2d02..39cd783de00 100644
--- a/cpp/tests/large_strings/parquet_tests.cpp
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -18,6 +18,7 @@
 #include
+#include
 #include
 #include
 #include
@@ -69,3 +70,76 @@ TEST_F(ParquetStringsTest, ReadLargeStrings)
   // go back to normal threshold
   unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
 }
+
+// Disabled as the test is too brittle and depends on the empirically set `pass_read_limit`,
+// encoding type, and the currently used `ZSTD` scratch space size.
+TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings)
+{
+  // Construct a table with one large strings column > 2GB
+  auto const wide = this->wide_column();
+  auto input      = cudf::concatenate(std::vector<cudf::column_view>(120000, wide));  ///< 230MB
+
+  int constexpr multiplier = 12;
+  std::vector<cudf::column_view> input_cols(multiplier, input->view());
+  auto col0 = cudf::concatenate(input_cols);  ///< 2.70GB
+
+  // Expected table
+  auto const expected    = cudf::table_view{{col0->view()}};
+  auto expected_metadata = cudf::io::table_input_metadata{expected};
+
+  // Needed to get exactly 2 Parquet subpasses: first with large strings and the second with
+  // regular ones. This may change in the future and lead to false failures.
+  expected_metadata.column_metadata[0].set_encoding(
+    cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY);
+
+  // Host buffer to write Parquet
+  std::vector<char> buffer;
+
+  // Writer options
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected)
+      .metadata(expected_metadata);
+
+  // Needed to get exactly 2 Parquet subpasses: first with large strings and the second with
+  // regular ones. This may change in the future and lead to false failures.
+  out_opts.set_compression(cudf::io::compression_type::ZSTD);
+
+  // Write to Parquet
+  cudf::io::write_parquet(out_opts);
+
+  // Empirically set pass_read_limit of 8GB so we read almost the entire table (>2GB strings) in
+  // the first subpass and only a small amount in the second subpass. This may change in the
+  // future and lead to false failures.
+  size_t constexpr pass_read_limit = size_t{8} * 1024 * 1024 * 1024;
+
+  // Reader options
+  cudf::io::parquet_reader_options default_in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size()));
+
+  // Chunked parquet reader
+  auto reader = cudf::io::chunked_parquet_reader(0, pass_read_limit, default_in_opts);
+
+  // Read chunked
+  auto tables = std::vector<std::unique_ptr<cudf::table>>{};
+  while (reader.has_next()) {
+    tables.emplace_back(reader.read_chunk().tbl);
+  }
+  auto table_views = std::vector<cudf::table_view>{};
+  std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) {
+    return tbl->view();
+  });
+  auto result            = cudf::concatenate(table_views);
+  auto const result_view = result->view();
+
+  // Verify offsets
+  for (auto const& cv : result_view) {
+    auto const offsets = cudf::strings_column_view(cv).offsets();
+    EXPECT_EQ(offsets.type(), cudf::data_type{cudf::type_id::INT64});
+  }
+
+  // Verify tables to be equal
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result_view, expected);
+
+  // Verify that we read exactly two table chunks
+  EXPECT_EQ(tables.size(), 2);
+}
diff --git a/cpp/tests/streams/quantile_test.cpp b/cpp/tests/streams/quantile_test.cpp
new file mode 100644
index 00000000000..4f4f16a9e70
--- /dev/null
+++ b/cpp/tests/streams/quantile_test.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct QuantileTest : public cudf::test::BaseFixture {}; + +TEST_F(QuantileTest, TestMultiColumnUnsorted) +{ + auto input_a = cudf::test::strings_column_wrapper( + {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C", + "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"}, + {true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + cudf::test::fixed_width_column_wrapper input_b( + {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto input = cudf::table_view({input_a, input_b}); + + auto actual = cudf::quantiles(input, + {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING, cudf::order::DESCENDING}, + {}, + cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, TestEmpty) +{ + auto input = cudf::test::fixed_width_column_wrapper({}); + cudf::quantile( + input, {0.5, 0.25}, cudf::interpolation::LINEAR, {}, true, cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, EmptyInput) +{ + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input, cudf::test::get_default_stream()); + + cudf::tdigest::tdigest_column_view tdv(*empty); + auto result = cudf::percentile_approx(tdv, percentiles, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index 41a5940c880..3c8483b153d 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,158 @@ TEST_F(StringsFindMultipleTest, ErrorTest) auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto empty_view = cudf::strings_column_view(zero_size_strings_column); // targets must have at least one string - EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, empty_view), std::invalid_argument); // targets cannot have nulls - EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument); +} + +TEST_F(StringsFindMultipleTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. 
t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", ""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindMultipleTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + 
"quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); } diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 2da95ba5c27..a3066c40650 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -17,16 +17,14 @@ #include #include #include +#include -#include #include #include #include #include #include -#include - #include struct StringsFindTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index ef35a4472cf..042ac44621e 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -28,155 +28,169 @@ struct MinHashTest : public cudf::test::BaseFixture {}; -TEST_F(MinHashTest, Basic) +TEST_F(MinHashTest, Permuted) { - auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", - "", "this is doc 2", - "", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}, - validity); + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view); + auto first = 
thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1392101586u, 394869177u, 811528444u}, + LCW32{ 211415830u, 187088503u, 130291444u}, + LCW32{2098117052u, 394869177u, 799753544u}, + LCW32{2264583304u, 2920538364u, 3576493424u}, + LCW32{ 253327882u, 41747273u, 302030804u}, + LCW32{2109809594u, 1017470651u, 326988172u}, + LCW32{1303819864u, 850676747u, 147107852u}, + LCW32{ 736021564u, 720812292u, 1405158760u}, + LCW32{ 902780242u, 134064807u, 1613944636u}, + LCW32{ 547084870u, 1748895564u, 656501844u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto results64 = nvtext::minhash64(view); - auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, - 0ul, - 3232308021562742685ul, - 0ul, - 13145552576991307582ul, - 14660046701545912182ul, - 398062025280761388ul}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); -TEST_F(MinHashTest, LengthEqualsWidth) -{ - auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); - auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( - {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 827364888116975697ul, 1601854279692781452ul, 70500662054893256ul}, + LCW64{ 18312093741021833ul, 133793446674258329ul, 21974512489226198ul}, + LCW64{ 22474244732520567ul, 1638811775655358395ul, 949306297364502264ul}, + LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul}, + LCW64{ 65816830624808020ul, 43323600380520789ul, 63511816333816345ul}, + LCW64{ 629657184954525200ul, 49741036507643002ul, 97466271004074331ul}, + LCW64{ 301611977846331113ul, 101188874709594830ul, 97466271004074331ul}, + LCW64{ 121498891461700668ul, 171065800427907402ul, 97466271004074331ul}, + LCW64{ 54617739511834072ul, 231454301607238929ul, 97466271004074331ul}, + LCW64{ 576418665851990314ul, 231454301607238929ul, 97466271004074331ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeed) +TEST_F(MinHashTest, PermutedWide) { - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "this is doc 2", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog."}); - - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); // below wide_string_threshold + std::string const wide(2 << 19, 'y'); // above wide_string_threshold + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = 
thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; + using LCW32 = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, - LCW{ 21141582u, 580916568u, 1258052021u}, - LCW{1207251914u, 943567174u, 1109272887u}, - LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}}); + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u}, + LCW32{1293098788u, 2860992281u, 133918478u} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, - LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + LCW64 expected64({ + LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul}, + LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeedWithNullInputRow) +TEST_F(MinHashTest, PermutedManyParameters) { - auto validity = cudf::test::iterators::null_at(1); - auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); + std::string const wide(2 << 19, 'y'); + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + // more than params_per_thread + auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, - validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u, 1777049372u, 360410720u, 3238739364u, 1822100712u, 405462060u, + 3283790704u, 1867152052u, 450513400u, 3328842044u, 1912203392u, 495564740u, 3373893384u, 1957254732u, + 540616080u, 3418944724u, 2002306072u, 585667420u, 3463996064u, 2047357412u, 630718760u, 3509047404u, + 2092408752u, 675770100u, 3554098744u, 2137460092u, 720821440u, 3599150084u, 2182511432u}, + LCW32{1293098788u, 2860992281u, 133918478u, 1701811971u, 3269705464u, 542631661u, 2110525154u, 3678418647u, + 951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u, 
200877717u, 1768771210u, 3336664703u, + 609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u, + 267836956u, 1835730449u, 3403623942u, 676550139u, 2244443632u, 3812337125u, 1085263322u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + // more than params_per_thread + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, - LCW64{}, - LCW64{0ul, 0ul}, - LCW64{2717781266371273264ul, 6977325820868387259ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, WordsMinHash) -{ - using LCWS = cudf::test::lists_column_wrapper; - auto validity = cudf::test::iterators::null_at(1); - - LCWS input( - {LCWS({"hello", "abcdéfgh"}), - LCWS{}, - LCWS({"rapids", "moré", "test", "text"}), - LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, - validity); - - auto view = cudf::lists_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); - using LCW32 = cudf::test::lists_column_wrapper; - LCW32 expected({LCW32{2069617641u, 1975382903u}, - LCW32{}, - LCW32{657297235u, 1010955999u}, - LCW32{644643885u, 310002789u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, - LCW64{}, - LCW64{5331949571924938590ul, 2088583894581919741ul}, - LCW64{3400468157617183341ul, 2398577492366130055ul}}, - validity); + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853, 641024893347719371, 1769570368846988848, 592272835132564366, + 1720818310631833835, 543520776917409353, 1672066252416678822, 494768718702254348, + 1623314194201523817, 446016660487099335, 1574562135986368804, 397264602271944322, + 1525810077771213799, 348512544056789317, 1477058019556058786, 299760485841634304, + 1428305961340903773, 251008427626479291, 1379553903125748768, 202256369411324286, + 1330801844910593755, 153504311196169273, 1282049786695438742, 104752252981014268, + 1233297728480283737, 56000194765859255, 1184545670265128724, 7248136550704242, + 1135793612049973719, 2264339087549243188, 1087041553834818706}, + LCW64{1389920339306667795, 421787002125838902, 1759496674158703968, 791363336977875075, + 2129073009010740141, 1160939671829911248, 192806334649082363, 1530516006681947421, + 562382669501118536, 1900092341533983602, 931959004353154709, 2269668676386019775, + 1301535339205190882, 333402002024361997, 1671111674057227055, 702978336876398170, + 2040688008909263228, 1072554671728434343, 104421334547605450, 1442131006580470516, + 473997669399641631, 1811707341432506689, 843574004251677804, 2181283676284542862, + 1213150339103713977, 245017001922885084, 1582726673955750150, 614593336774921257, + 1952303008807786323, 984169671626957438, 16036334446128545} + }); + // clang-format on 
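+  // As in the 32-bit case above, the 31 (a, b) parameter pairs exceed
+  // params_per_thread, so these expected values cover more than one parameter batch.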
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto results = nvtext::minhash(view); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); - results = nvtext::minhash64(view); + auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + results = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -184,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); - auto seeds64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument); + auto empty = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); + auto empty64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); - seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); + auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), + std::overflow_error); + + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + std::invalid_argument); } diff --git a/dependencies.yaml b/dependencies.yaml index 4c6aefe996f..b5165f82d5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -442,7 +442,7 @@ 
dependencies:
   common:
     - output_types: [conda]
       packages:
-          - pyarrow>=14.0.0,<18.0.0a0
+          - pyarrow>=14.0.0,<19.0.0a0
     - output_types: [requirements, pyproject]
       packages:
           # pyarrow 17.0.0 wheels have a subtle issue around threading that
@@ -450,8 +450,8 @@ dependencies:
           # be highly dependent on the exact build configuration, so we'll just
           # avoid 17.0.0 for now unless we observe similar issues in future
           # releases as well.
-          - pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'
-          - pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'
+          - pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'
+          - pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'
   cuda_version:
     specific:
       - output_types: conda
@@ -587,6 +587,12 @@ dependencies:
       packages:
         - clang==19.1.0
        - clang-tools==19.1.0
+        # TODO: These are build requirements for IWYU and can be replaced
+        # with IWYU itself once a conda package of IWYU supporting clang 19
+        # is available.
+        - clangdev==19.1.0
+        - llvm==19.1.0
+        - llvmdev==19.1.0
   docs:
     common:
       - output_types: [conda]
@@ -669,7 +675,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - cachetools
-          - &numba-cuda-dep numba-cuda>=0.0.13
+          - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18
           - nvtx>=0.2.1
           - packaging
           - rich
@@ -728,7 +734,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.11,<1.13
+          - polars>=1.11,<1.14
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 5942cc16850..fbb9ca4b128 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -26,16 +26,18 @@
 import tempfile
 import warnings
 import xml.etree.ElementTree as ET
+from enum import IntEnum
+from typing import Any
+
+import cudf
 from docutils.nodes import Text
 from packaging.version import Version
-from sphinx.addnodes import pending_xref
-from sphinx.highlighting import lexers
-from sphinx.ext import intersphinx
 from pygments.lexer import RegexLexer
 from pygments.token import Text as PText
-
-import cudf
+from sphinx.addnodes import pending_xref
+from sphinx.ext import intersphinx
+from sphinx.ext.autodoc import ClassDocumenter, bool_option
+from sphinx.highlighting import lexers


 class PseudoLexer(RegexLexer):
@@ -342,7 +344,10 @@ def clean_all_xml_files(path):
     "cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
     "cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
     "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
-    "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
+    "DeviceBuffer": (
+        "rmm.pylibrmm.device_buffer.DeviceBuffer",
+        "rmm.DeviceBuffer",
+    ),
 }
@@ -373,7 +378,14 @@ def _generate_namespaces(namespaces):
 _all_namespaces = _generate_namespaces(
     {
         # Note that io::datasource is actually a nested class
-        "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
+        "cudf": {
+            "io",
+            "io::datasource",
+            "strings",
+            "ast",
+            "ast::expression",
+            "io::text",
+        },
         "numeric": {},
         "nvtext": {},
     }
@@ -554,6 +566,8 @@ def on_missing_reference(app, env, node, contnode):

 nitpick_ignore = [
+    # Erroneously warned in ParquetColumnSchema.name
+    ("py:class", "unicode"),
     ("py:class", "SeriesOrIndex"),
     ("py:class", "Dtype"),
     # The following are erroneously warned due to
@@ -640,9 +654,54 @@ def linkcode_resolve(domain, info) -> str | None:
         f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
     )

+# Needed to avoid a build warning from the PandasCompat extension
suppress_warnings = ["myst.domains"]


+class PLCIntEnumDocumenter(ClassDocumenter):
+    objtype = "enum"
+    directivetype = "attribute"
+    priority = 10 + ClassDocumenter.priority
+
+    option_spec = dict(ClassDocumenter.option_spec)
+
+    @classmethod
+    def can_document_member(
+        cls, member: Any, membername: str, isattr: bool, parent: Any
+    ) -> bool:
+        try:
+            return issubclass(
+                member, IntEnum
+            ) and member.__module__.startswith("pylibcudf")
+        except TypeError:
+            return False
+
+    def add_directive_header(self, sig: str) -> None:
+        self.directivetype = "attribute"
+        super().add_directive_header(sig)
+
+    def add_content(self, more_content) -> None:
+        doc_as_attr = self.doc_as_attr
+        self.doc_as_attr = False
+        super().add_content(more_content)
+        self.doc_as_attr = doc_as_attr
+        source_name = self.get_sourcename()
+        enum_object: IntEnum = self.object
+
+        if self.object.__name__ != "Kind":
+            self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name)
+        self.add_line("", source_name)
+        self.add_line("Enum members", source_name)
+        self.add_line("", source_name)
+
+        for the_member_name in enum_object.__members__:  # type: ignore[attr-defined]
+            self.add_line(
+                f"* ``{the_member_name}``", source_name
+            )
+        self.add_line("", source_name)
+
+
 def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
     app.add_js_file(
@@ -650,3 +709,5 @@ def setup(app):
     )
     app.connect("doctree-read", resolve_aliases)
     app.connect("missing-reference", on_missing_reference)
+    app.setup_extension("sphinx.ext.autodoc")
+    app.add_autodocumenter(PLCIntEnumDocumenter)
diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md
index 911a64fa152..b653b786129 100644
--- a/docs/cudf/source/developer_guide/cudf_pandas.md
+++ b/docs/cudf/source/developer_guide/cudf_pandas.md
@@ -11,7 +11,8 @@ In the rest of this document, to maintain a concrete pair of libraries in mind,
 For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library).

 ```{note}
-We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type.
+1. We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type.
+2. A `custom_iter` method is defined to always use the slow object's `iter` method. This avoids moving the object to the GPU (where iteration would raise an error), only to move it back to the CPU to execute the iteration successfully.
 ```

 ### Types:
diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 39840e72e21..1ee828e7c4e 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design princip
 - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
 - All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
 - Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.
-
+- Type stubs are provided and generated manually. When adding new
+  functionality, ensure that the matching type stub is appropriately updated.

 ## Relationship to libcudf

@@ -249,3 +250,73 @@ In the event that libcudf provides multiple overloads for the same function with
 and set arguments not shared between overloads to `None`. If a user tries to pass in
 an unsupported argument for a specific overload type, you should raise `ValueError`.

 Finally, consider filing a libcudf issue if you think this inconsistency can be addressed on the libcudf side.
+
+### Type stubs
+
+Since static type checkers like `mypy` and `pyright` cannot parse
+Cython code, we provide type stubs for the pylibcudf package. These
+are currently maintained manually, alongside the matching pylibcudf
+files.
+
+Every `pyx` file should have a matching `pyi` file that provides the
+type stubs. Most functions can be exposed straightforwardly. Some
+guiding principles:
+
+- For typed integer arguments in libcudf, use `int` as a type
+  annotation.
+- For functions which are annotated as a `list` in Cython, but the
+  function body does more detailed checking, try to encode the
+  detailed information in the type.
+- For Cython fused types there are two options:
+  1. If the fused type appears only once in the function signature,
+     use a `Union` type;
+  2. If the fused type appears more than once (or as both an input
+     and output type), use a `TypeVar` with
+     the variants in the fused type provided as constraints.
+
+
+As an example, `pylibcudf.copying.split` is typed in Cython as:
+
+```cython
+ctypedef fused ColumnOrTable:
+    Table
+    Column
+
+cpdef list split(ColumnOrTable input, list splits): ...
+```
+
+Here we only have a single use of the fused type, and the `list`
+arguments do not specify their values. If we provide a `Column`
+as input, we receive a `list[Column]` as output, and if we provide a
+`Table` we receive `list[Table]` as output.
+
+In the type stub, we can encode this with a `TypeVar`; we can also
+provide typing for the `splits` argument that indicates that the split
+values must be integers:
+
+```python
+ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)
+
+def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
+```
+
+Conversely, `pylibcudf.copying.scatter` uses a fused type only once in
+its input:
+
+```cython
+ctypedef fused TableOrListOfScalars:
+    Table
+    list
+
+cpdef Table scatter(
+    TableOrListOfScalars source, Column scatter_map, Table target
+)
+```
+
+In the type stub, we can use a normal union in this case:
+
+```python
+def scatter(
+    source: Table | list[Scalar], scatter_map: Column, target: Table
+) -> Table: ...
+```
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 53638f071cc..1c1c8040972 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,4 +19,6 @@ I/O Functions
    csv
    json
    parquet
+   parquet_metadata
+   text
    timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst
new file mode 100644
index 00000000000..fce964f9714
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst
@@ -0,0 +1,6 @@
+================
+Parquet Metadata
+================
+
+..
automodule:: pylibcudf.io.parquet_metadata + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst new file mode 100644 index 00000000000..327ca043f36 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst @@ -0,0 +1,6 @@ +==== +text +==== + +.. automodule:: pylibcudf.io.text + :members: diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 4b5379cf0f1..b85c215d7d1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -65,7 +65,7 @@ cmake .. -G"${CMAKE_GENERATOR}" \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ -DBUILD_SHARED_LIBS=OFF \ - -DKvikIO_REMOTE_SUPPORT=OFF + -DCUDF_KVIKIO_REMOTE_IO=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java index e48b1cf59e4..86b6b98f2ae 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ * that is backing it. */ public class DeviceMemoryBufferView extends BaseDeviceMemoryBuffer { - DeviceMemoryBufferView(long address, long lengthInBytes) { + public DeviceMemoryBufferView(long address, long lengthInBytes) { // Set the cleaner to null so we don't end up releasing anything super(address, lengthInBytes, (MemoryBufferCleaner) null); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index dbee53640aa..b01ce31b1f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -259,7 +259,6 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter) throws CudfException; @@ -275,7 +274,6 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter, long dsHandle) throws CudfException; @@ -1092,224 +1090,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { return readJSON(schema, opts, buffer, 0, buffer.length); } - private static class DidViewChange { - ColumnVector changeWasNeeded = null; - boolean noChangeNeeded = false; - - public static DidViewChange yes(ColumnVector cv) { - DidViewChange ret = new DidViewChange(); - ret.changeWasNeeded = cv; - return ret; - } - - public static DidViewChange no() { - DidViewChange ret = new DidViewChange(); - ret.noChangeNeeded = true; - return ret; - } - } - - private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.NestedChildren children, - ColumnView cv) { - // We need to do this recursively to be sure it all matches as expected. - // If we run into problems where the data types don't match, we are not - // going to fix up the data types. We are only going to reorder the columns. 
- if (schema.getType() == DType.STRUCT) { - if (cv.getType() != DType.STRUCT) { - // The types don't match so just return the input unchanged... - return DidViewChange.no(); - } else { - String[] foundNames; - if (children == null) { - foundNames = new String[0]; - } else { - foundNames = children.getNames(); - } - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - String[] neededNames = schema.getColumnNames(); - ColumnView[] columns = new ColumnView[neededNames.length]; - try { - boolean somethingChanged = false; - if (columns.length != foundNames.length) { - somethingChanged = true; - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededNames[i]; - Integer index = indices.get(neededColumnName); - Schema childSchema = schema.getChild(i); - if (index != null) { - if (childSchema.isStructOrHasStructDescendant()) { - ColumnView child = cv.getChildColumnView(index); - boolean shouldCloseChild = true; - try { - if (index != i) { - somethingChanged = true; - } - DidViewChange childResult = gatherJSONColumns(schema.getChild(i), - children.getChild(index), child); - if (childResult.noChangeNeeded) { - shouldCloseChild = false; - columns[i] = child; - } else { - somethingChanged = true; - columns[i] = childResult.changeWasNeeded; - } - } finally { - if (shouldCloseChild) { - child.close(); - } - } - } else { - if (index != i) { - somethingChanged = true; - } - columns[i] = cv.getChildColumnView(index); - } - } else { - somethingChanged = true; - if (types[i] == DType.LIST) { - try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else if (types[i] == DType.STRUCT) { - int numStructChildren = childSchema.getNumChildren(); - HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; - for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { - structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildren)) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } - } - } - if (somethingChanged) { - try (ColumnView ret = new ColumnView(cv.type, cv.rows, Optional.of(cv.nullCount), - cv.getValid(), null, columns)) { - return DidViewChange.yes(ret.copyToColumnVector()); - } - } else { - return DidViewChange.no(); - } - } finally { - for (ColumnView c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } else if (schema.getType() == DType.LIST && cv.getType() == DType.LIST) { - if (schema.isStructOrHasStructDescendant()) { - String [] childNames = children.getNames(); - if (childNames.length == 2 && - "offsets".equals(childNames[0]) && - "element".equals(childNames[1])) { - try (ColumnView child = cv.getChildColumnView(0)){ - DidViewChange listResult = gatherJSONColumns(schema.getChild(0), - children.getChild(1), child); - if (listResult.noChangeNeeded) { - return DidViewChange.no(); - } else { - try (ColumnView listView = new ColumnView(cv.type, cv.rows, - Optional.of(cv.nullCount), cv.getValid(), cv.getOffsets(), - new ColumnView[]{listResult.changeWasNeeded})) { - return 
DidViewChange.yes(listView.copyToColumnVector()); - } finally { - listResult.changeWasNeeded.close(); - } - } - } - } - } - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } else { - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } - } - - private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { - String[] neededColumns = schema.getColumnNames(); - if (neededColumns == null || neededColumns.length == 0) { - return twm.releaseTable(); - } else { - String[] foundNames = twm.getColumnNames(); - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - ColumnVector[] columns = new ColumnVector[neededColumns.length]; - try (Table tbl = twm.releaseTable()) { - int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); - if (rowCount < 0) { - throw new IllegalStateException( - "No empty row count provided and the table read has no row count or columns"); - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededColumns[i]; - Integer index = indices.get(neededColumnName); - if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { - DidViewChange gathered = gatherJSONColumns(schema.getChild(i), twm.getChild(index), - tbl.getColumn(index)); - if (gathered.noChangeNeeded) { - columns[i] = tbl.getColumn(index).incRefCount(); - } else { - columns[i] = gathered.changeWasNeeded; - } - } else { - columns[i] = tbl.getColumn(index).incRefCount(); - } - } else { - if (types[i] == DType.LIST) { - Schema listSchema = schema.getChild(i); - Schema elementSchema = listSchema.getChild(0); - try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else if (types[i] == DType.STRUCT) { - Schema structSchema = schema.getChild(i); - int numStructChildren = structSchema.getNumChildren(); - DataType[] structChildrenTypes = new DataType[numStructChildren]; - for (int j = 0; j < numStructChildren; j++) { - structChildrenTypes[j] = structSchema.getChild(j).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } - } - } - return new Table(columns); - } finally { - for (ColumnVector c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } - /** * Read a JSON file. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. @@ -1318,10 +1098,6 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp * @return the file parsed as a table on the GPU. 
*/ public static Table readJSON(Schema schema, JSONOptions opts, File path) { - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1336,11 +1112,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, -1); + return twm.releaseTable(); } } @@ -1361,6 +1136,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1370,6 +1149,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator, int emptyRowCount) { @@ -1381,14 +1161,14 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); + return readJSON(schema, opts, newBuf, 0, len); } } + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, int emptyRowCount) { - return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), - emptyRowCount); + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, @@ -1470,6 +1250,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1478,6 +1262,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @param emptyRowCount the number of rows to use if no columns were found. * @return the data parsed as a table on the GPU. 
*/ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, long offset, long len, int emptyRowCount) { if (len <= 0) { @@ -1486,10 +1271,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1505,10 +1286,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } } @@ -1525,18 +1305,19 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1550,11 +1331,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter(), dsHandle))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0a667978ca3..1f8b1ea207d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1037,21 +1037,23 @@ cudf::io::schema_element read_schema_element(int& index, if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { std::map child_elems; int num_children = children[index]; + std::vector child_names(num_children); // go to the next entry, so recursion can parse it. 
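  // (after the increment, `index` refers to this element's first child)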
index++; for (int i = 0; i < num_children; i++) { - auto const name = std::string{names.get(index).get()}; + auto name = std::string{names.get(index).get()}; child_elems.insert( std::pair{name, cudf::jni::read_schema_element(index, children, names, types, scales)}); + child_names[i] = std::move(name); } - return cudf::io::schema_element{d_type, std::move(child_elems)}; + return cudf::io::schema_element{d_type, std::move(child_elems), {std::move(child_names)}}; } else { if (children[index] != 0) { throw std::invalid_argument("found children for a type that should have none"); } // go to the next entry before returning... index++; - return cudf::io::schema_element{d_type, {}}; + return cudf::io::schema_element{d_type, {}, std::nullopt}; } } @@ -1824,7 +1826,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter, jlong ds_handle) @@ -1853,6 +1854,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1864,7 +1866,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1886,13 +1887,19 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, } std::map data_types; + std::vector name_order; int at = 0; while (at < n_types.size()) { auto const name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.push_back(name); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); + } else { // should infer the types } @@ -1925,7 +1932,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter) { @@ -1968,6 +1974,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1979,7 +1986,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -2001,13 +2007,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, } std::map data_types; + std::vector name_order; + name_order.reserve(n_types.size()); int at = 0; while (at < n_types.size()) { - auto const name = std::string{n_col_names.get(at).get()}; + auto name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.emplace_back(std::move(name)); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); } else { // should infer the types } diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index d844466120f..7e8f29dac93 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -4,46 +4,28 @@ import warnings from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - cimport pylibcudf.libcudf.datetime as libcudf_datetime -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.filling cimport calendrical_month_sequence -from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.datetime import DatetimeComponent +from pylibcudf.datetime import DatetimeComponent, RoundingFrequency from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() def add_months(Column col, Column months): # months must be int16 dtype - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef column_view months_view = months.view() - - with nogil: - c_result = move( - libcudf_datetime.add_calendrical_months( - col_view, - months_view - ) + return Column.from_pylibcudf( + plc.datetime.add_calendrical_months( + col.to_pylibcudf(mode="read"), + months.to_pylibcudf(mode="read") ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.datetime_component component - component_names = { "year": DatetimeComponent.YEAR, "month": DatetimeComponent.MONTH, @@ -57,33 +39,29 @@ def extract_datetime_component(Column col, object field): "nanosecond": DatetimeComponent.NANOSECOND, } if field == "day_of_year": - with nogil: - c_result = move(libcudf_datetime.day_of_year(col_view)) + result = Column.from_pylibcudf( + plc.datetime.day_of_year( + col.to_pylibcudf(mode="read") + ) + ) elif field in component_names: - component = component_names[field] - with nogil: - c_result = move( - 
libcudf_datetime.extract_datetime_component(
-                    col_view,
-                    component
-                )
+        result = Column.from_pylibcudf(
+            plc.datetime.extract_datetime_component(
+                col.to_pylibcudf(mode="read"),
+                component_names[field],
             )
+        )
+        if field == "weekday":
+            # Pandas counts Monday-Sunday as 0-6
+            # while libcudf counts Monday-Sunday as 1-7
+            result = result - result.dtype.type(1)
     else:
         raise ValueError(f"Invalid field: '{field}'")

-    result = Column.from_unique_ptr(move(c_result))
-
-    if field == "weekday":
-        # Pandas counts Monday-Sunday as 0-6
-        # while libcudf counts Monday-Sunday as 1-7
-        result = result - result.dtype.type(1)
-
     return result


 cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq):
-    cdef libcudf_datetime.rounding_frequency freq_val
-
     # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html
     old_to_new_freq_map = {
         "H": "h",
@@ -101,96 +79,75 @@ cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq):
             FutureWarning
         )
         freq = old_to_new_freq_map.get(freq)
-    if freq == "D":
-        freq_val = libcudf_datetime.rounding_frequency.DAY
-    elif freq == "h":
-        freq_val = libcudf_datetime.rounding_frequency.HOUR
-    elif freq == "min":
-        freq_val = libcudf_datetime.rounding_frequency.MINUTE
-    elif freq == "s":
-        freq_val = libcudf_datetime.rounding_frequency.SECOND
-    elif freq == "ms":
-        freq_val = libcudf_datetime.rounding_frequency.MILLISECOND
-    elif freq == "us":
-        freq_val = libcudf_datetime.rounding_frequency.MICROSECOND
-    elif freq == "ns":
-        freq_val = libcudf_datetime.rounding_frequency.NANOSECOND
+    rounding_frequency_map = {
+        "D": RoundingFrequency.DAY,
+        "h": RoundingFrequency.HOUR,
+        "min": RoundingFrequency.MINUTE,
+        "s": RoundingFrequency.SECOND,
+        "ms": RoundingFrequency.MILLISECOND,
+        "us": RoundingFrequency.MICROSECOND,
+        "ns": RoundingFrequency.NANOSECOND,
+    }
+    if freq in rounding_frequency_map:
+        return rounding_frequency_map[freq]
     else:
         raise ValueError(f"Invalid resolution: '{freq}'")
-    return freq_val


 @acquire_spill_lock()
 def ceil_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.ceil_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.ceil_datetimes(
+            col.to_pylibcudf(mode="read"),
+            _get_rounding_frequency(freq),
+        )
+    )


 @acquire_spill_lock()
 def floor_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.floor_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.floor_datetimes(
+            col.to_pylibcudf(mode="read"),
+            _get_rounding_frequency(freq),
+        )
+    )


 @acquire_spill_lock()
 def round_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.round_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.round_datetimes(
col.to_pylibcudf(mode="read"), + _get_rounding_frequency(freq), + ) + ) @acquire_spill_lock() def is_leap_year(Column col): """Returns a boolean indicator whether the year of the date is a leap year """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.is_leap_year(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.is_leap_year( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def date_range(DeviceScalar start, size_type n, offset): - cdef unique_ptr[column] c_result cdef size_type months = ( offset.kwds.get("years", 0) * 12 + offset.kwds.get("months", 0) ) - - cdef const scalar* c_start = start.get_raw_ptr() - with nogil: - c_result = move(calendrical_month_sequence( + return Column.from_pylibcudf( + plc.filling.calendrical_month_sequence( n, - c_start[0], - months - )) - return Column.from_unique_ptr(move(c_result)) + start.c_value, + months, + ) + ) @acquire_spill_lock() @@ -199,34 +156,28 @@ def extract_quarter(Column col): Returns a column which contains the corresponding quarter of the year for every timestamp inside the input column. """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.extract_quarter(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.extract_quarter( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def days_in_month(Column col): """Extracts the number of days in the month of the date """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.days_in_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.days_in_month( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def last_day_of_month(Column col): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.last_day_of_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.last_day_of_month( + col.to_pylibcudf(mode="read") + ) + ) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c199ed96d4f..1ce6dfab15e 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -18,7 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib.scalar import as_device_scalar -from pylibcudf.libcudf.replace cimport replace_policy from pylibcudf.libcudf.scalar.scalar cimport scalar import pylibcudf @@ -244,13 +243,11 @@ cdef class GroupBy: return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys) def replace_nulls(self, list values, object method): - # TODO: This is using an enum (replace_policy) that has not been exposed in - # pylibcudf yet. We'll want to fix that import once it is in pylibcudf. 
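# Note (hedged sketch): the enum the removed TODO refers to is now exposed
# as pylibcudf.replace.ReplacePolicy, used just below. PRECEDING fills each
# null from the last valid value within the group ('ffill'); anything else
# maps to FOLLOWING, which back-fills from the next valid value ('bfill').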
_, replaced = self._groupby.replace_nulls( pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), [ - replace_policy.PRECEDING - if method == 'ffill' else replace_policy.FOLLOWING + pylibcudf.replace.ReplacePolicy.PRECEDING + if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING ] * len(values), ) diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 76a6e32fde0..96504ebdd66 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,7 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 564daefbae2..f23980b387a 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -7,76 +7,20 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from pylibcudf.io.datasource cimport Datasource from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.datasource cimport datasource from pylibcudf.libcudf.io.types cimport ( column_name_info, - host_buffer, sink_info, - source_info, ) from cudf._lib.column cimport Column import codecs -import errno import io import os from cudf.core.dtypes import StructDtype - -# Converts the Python source input to libcudf IO source_info -# with the appropriate type and source values -cdef source_info make_source_info(list src) except*: - if not src: - raise ValueError("Need to pass at least one source") - - cdef const unsigned char[::1] c_buffer - cdef vector[host_buffer] c_host_buffers - cdef vector[string] c_files - cdef Datasource csrc - cdef vector[datasource*] c_datasources - empty_buffer = False - if isinstance(src[0], bytes): - empty_buffer = True - for buffer in src: - if (len(buffer) > 0): - c_buffer = buffer - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - empty_buffer = False - elif isinstance(src[0], io.BytesIO): - for bio in src: - c_buffer = bio.getbuffer() # check if empty? - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - # Otherwise src is expected to be a numeric fd, string path, or PathLike. - # TODO (ptaylor): Might need to update this check if accepted input types - # change when UCX and/or cuStreamz support is added. - elif isinstance(src[0], Datasource): - for csrc in src: - c_datasources.push_back(csrc.get_datasource()) - return source_info(c_datasources) - elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)): - # If source is a file, return source_info where type=FILEPATH - if not all(os.path.isfile(file) for file in src): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - src) - - files = [ str(elem).encode() for elem in src] - c_files = files - return source_info(c_files) - else: - raise TypeError("Unrecognized input type: {}".format(type(src[0]))) - - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) - - return source_info(c_host_buffers) - # Converts the Python sink input to libcudf IO sink_info. 
cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & sink diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index fb149603960..7dc9cd01a00 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -104,7 +104,7 @@ cpdef read_json(object filepaths_or_buffers, ) df = cudf.DataFrame._from_data( *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in res_cols], + columns=[Column.from_pylibcudf(col) for col in res_cols], column_names=res_col_names, index_names=None ) diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx index 3966cce8981..524bfd3b2e8 100644 --- a/python/cudf/cudf/_lib/labeling.pyx +++ b/python/cudf/cudf/_lib/labeling.pyx @@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive, plc_column = plc.labeling.label_bins( input.to_pylibcudf(mode="read"), left_edges.to_pylibcudf(mode="read"), - left_inclusive, + plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO, right_edges.to_pylibcudf(mode="read"), - right_inclusive + plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO, ) return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 12432ac6d5d..9a2aa4a6130 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( plc.lists.distinct( col.to_pylibcudf(mode="read"), - nulls_equal, - nans_all_equal, + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL, + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL, ) ) @@ -48,7 +50,7 @@ def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( plc.lists.sort_lists( col.to_pylibcudf(mode="read"), - ascending, + order.ASCENDING if ascending else order.DESCENDING, null_order.BEFORE if na_position == "first" else null_order.AFTER, False, ) @@ -91,7 +93,7 @@ def index_of_scalar(Column col, object py_search_key): plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -102,7 +104,7 @@ def index_of_column(Column col, Column search_keys): plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -123,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), - dropna, + plc.lists.ConcatenateNullPolicy.IGNORE + if dropna + else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, ) ) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5e39cafa47b..25cfcf99ca6 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
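# Sketch of what the new permuted minhash bindings below compute (formula
# taken from the docstrings added in column/string.py further down): for
# each row and each parameter pair (a[i], b[i]),
#     min over substring hashes hv of (hv * a[i] + b[i]) % (2**61 - 1)
# A single integer seed replaces the old per-hash seeds column, which is
# why uint32_t/uint64_t are cimported here.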
+from libc.stdint cimport uint32_t, uint64_t + from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column @@ -17,6 +19,19 @@ def minhash(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def minhash64(Column input, Column seeds, int width=4): result = nvtext.minhash.minhash64( @@ -27,6 +42,19 @@ def minhash64(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash64_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def word_minhash(Column input, Column seeds): result = nvtext.minhash.word_minhash( diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1212637d330..d4bd0cd306c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -27,7 +27,6 @@ from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string -from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector @@ -41,12 +40,7 @@ from pylibcudf.libcudf.io.parquet cimport ( parquet_writer_options, write_parquet as parquet_writer, ) -from pylibcudf.libcudf.io.parquet_metadata cimport ( - parquet_metadata, - read_parquet_metadata as parquet_metadata_reader, -) from pylibcudf.libcudf.io.types cimport ( - source_info, sink_info, column_in_metadata, table_input_metadata, @@ -62,7 +56,6 @@ from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( add_df_col_struct_names, make_sinks_info, - make_source_info, ) from cudf._lib.utils cimport table_view_from_table @@ -373,7 +366,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, nrows=nrows, skip_rows=skip_rows) return df -cpdef read_parquet_metadata(filepaths_or_buffers): +cpdef read_parquet_metadata(list filepaths_or_buffers): """ Cython function to call into libcudf API, see `read_parquet_metadata`. 
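A minimal standalone sketch of the pylibcudf pattern the rewritten body in the next hunk follows (the file name here is only illustrative; the accessor names are the ones used below):

    import pylibcudf as plc

    meta = plc.io.parquet_metadata.read_parquet_metadata(
        plc.io.SourceInfo(["example.parquet"])
    )
    # The wrapper derives its return values from these accessors.
    num_rows, num_rowgroups = meta.num_rows(), meta.num_rowgroups()
    col_names = [info.name() for info in meta.schema().root().children()]
    row_groups = meta.rowgroup_metadata()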
@@ -382,56 +375,40 @@ cpdef read_parquet_metadata(filepaths_or_buffers):
     cudf.io.parquet.read_parquet
     cudf.io.parquet.to_parquet
     """
-    cdef source_info source = make_source_info(filepaths_or_buffers)
-
-    args = move(source)
-
-    cdef parquet_metadata c_result
-
-    # Read Parquet metadata
-    with nogil:
-        c_result = move(parquet_metadata_reader(args))
-
-    # access and return results
-    num_rows = c_result.num_rows()
-    num_rowgroups = c_result.num_rowgroups()
-
-    # extract row group metadata and sanitize keys
-    row_group_metadata = [{k.decode(): v for k, v in metadata}
-                          for metadata in c_result.rowgroup_metadata()]
+    parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+        plc.io.SourceInfo(filepaths_or_buffers)
+    )

     # read all column names including index column, if any
-    col_names = [info.name().decode() for info in c_result.schema().root().children()]
-
-    # access the Parquet file_footer to find the index
-    index_col = None
-    cdef unordered_map[string, string] file_footer = c_result.metadata()
+    col_names = [info.name() for info in parquet_metadata.schema().root().children()]

-    # get index column name(s)
-    index_col_names = None
-    json_str = file_footer[b'pandas'].decode('utf-8')
-    meta = None
+    index_col_names = set()
+    json_str = parquet_metadata.metadata()['pandas']
     if json_str != "":
         meta = json.loads(json_str)
         file_is_range_index, index_col, _ = _parse_metadata(meta)

-        if not file_is_range_index and index_col is not None \
-                and index_col_names is None:
-            index_col_names = {}
+        if (
+            not file_is_range_index
+            and index_col is not None
+        ):
+            columns = meta['columns']
             for idx_col in index_col:
-                for c in meta['columns']:
+                for c in columns:
                     if c['field_name'] == idx_col:
-                        index_col_names[idx_col] = c['name']
+                        index_col_names.add(idx_col)

     # remove the index column from the list of column names
-    # only if index_col_names is not None
-    if index_col_names is not None:
+    # only if any index column names were found
+    if len(index_col_names) > 0:
         col_names = [name for name in col_names if name not in index_col_names]

-    # num_columns = length of list(col_names)
-    num_columns = len(col_names)
-
-    # return the metadata
-    return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata
+    return (
+        parquet_metadata.num_rows(),
+        parquet_metadata.num_rowgroups(),
+        col_names,
+        len(col_names),
+        parquet_metadata.rowgroup_metadata()
+    )


 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx
index 7666b7ff8da..509cfe5e9f8 100644
--- a/python/cudf/cudf/_lib/quantiles.pyx
+++ b/python/cudf/cudf/_lib/quantiles.pyx
@@ -6,14 +6,6 @@
 from libcpp cimport bool
 from libcpp.vector cimport vector

 from cudf._lib.column cimport Column
-from cudf._lib.types cimport (
-    underlying_type_t_interpolation,
-    underlying_type_t_sorted,
-)
-
-from cudf._lib.types import Interpolation
-
-from pylibcudf.libcudf.types cimport interpolation, sorted

 from cudf._lib.utils cimport columns_from_pylibcudf_table

@@ -28,17 +20,13 @@ def quantile(
     Column ordered_indices,
     bool exact,
 ):
-    cdef interpolation c_interp = <interpolation>(
-        <underlying_type_t_interpolation> Interpolation[interp.upper()]
-    )
-
     return Column.from_pylibcudf(
         plc.quantiles.quantile(
             input.to_pylibcudf(mode="read"),
             q,
-            c_interp,
+            plc.types.Interpolation[interp.upper()],
             ordered_indices.to_pylibcudf(mode="read"),
-            exact
+            exact
         )
     )

@@ -51,22 +39,14 @@ def quantile_table(
     list column_order,
     list null_precedence,
 ):
-
-    cdef interpolation c_interp = <interpolation>(
-        <underlying_type_t_interpolation> interp
-    )
-    cdef sorted c_is_input_sorted = <sorted>(
-        <underlying_type_t_sorted> is_input_sorted
-    )
-
     return columns_from_pylibcudf_table(
plc.quantiles.quantiles( plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]), q, - c_interp, - c_is_input_sorted, + interp, + is_input_sorted, column_order, null_precedence ) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 185552ede82..eefe37d9880 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -5,21 +5,10 @@ from itertools import repeat from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from pylibcudf.libcudf.aggregation cimport rank_method -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.search cimport lower_bound, upper_bound -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport null_order, order as cpp_order - from cudf._lib.column cimport Column -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table import pylibcudf @@ -311,44 +300,19 @@ def digitize(list source_columns, list bins, bool right=False): right : Indicating whether the intervals include the right or the left bin edge. """ - - cdef table_view bins_view = table_view_from_columns(bins) - cdef table_view source_table_view = table_view_from_columns( - source_columns - ) - cdef vector[cpp_order] column_order = ( - vector[cpp_order]( - bins_view.num_columns(), - cpp_order.ASCENDING - ) - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - bins_view.num_columns(), - null_order.BEFORE + return Column.from_pylibcudf( + getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in bins] + ), + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ), + [pylibcudf.types.Order.ASCENDING]*len(bins), + [pylibcudf.types.NullOrder.BEFORE]*len(bins) ) ) - cdef unique_ptr[column] c_result - if right: - with nogil: - c_result = move(lower_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - else: - with nogil: - c_result = move(upper_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def rank_columns(list source_columns, rank_method method, str na_option, diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index ffa5e603408..4c0ec2d9ac5 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,6 +9,8 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, + minhash64_permuted, + minhash_permuted, word_minhash, word_minhash64, ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx index 8b6da2bfa1c..50113347ccb 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -1,15 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - is_integer as cpp_is_integer, -) +import pylibcudf as plc from cudf._lib.column cimport Column @@ -20,12 +13,8 @@ def is_integer(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have integers. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_integers.is_integer( + source_strings.to_pylibcudf(mode="read") + ) + ) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index b2c7232f549..7942d067c2b 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,33 +1,20 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from io import TextIOBase +from libcpp cimport bool -from cython.operator cimport dereference -from libc.stdint cimport uint64_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move +from io import TextIOBase -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.io.text cimport ( - byte_range_info, - data_chunk_source, - make_source, - make_source_from_bgzip_file, - make_source_from_file, - multibyte_split, - parse_options, -) +import pylibcudf as plc from cudf._lib.column cimport Column def read_text(object filepaths_or_buffers, - object delimiter=None, - object byte_range=None, - object strip_delimiters=False, - object compression=None, - object compression_offsets=None): + str delimiter, + object byte_range, + bool strip_delimiters, + object compression, + object compression_offsets): """ Cython function to call into libcudf API, see `multibyte_split`. 
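A minimal standalone sketch of the pylibcudf text API the hunk below switches to, splitting an in-memory buffer on newlines (all three calls appear in the new body; ParseOptions is assumed to default byte_range to the whole source):

    import pylibcudf as plc

    source = plc.io.text.make_source("a\nb\nc\n")
    options = plc.io.text.ParseOptions(strip_delimiters=True)
    # one output row per delimited chunk
    plc_column = plc.io.text.multibyte_split(source, "\n", options)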
@@ -35,24 +22,11 @@ def read_text(object filepaths_or_buffers, -------- cudf.io.text.read_text """ - cdef string delim = delimiter.encode() - - cdef unique_ptr[data_chunk_source] datasource - cdef unique_ptr[column] c_col - - cdef size_t c_byte_range_offset - cdef size_t c_byte_range_size - cdef uint64_t c_compression_begin_offset - cdef uint64_t c_compression_end_offset - cdef parse_options c_options - if compression is None: if isinstance(filepaths_or_buffers, TextIOBase): - datasource = move(make_source( - filepaths_or_buffers.read().encode())) + datasource = plc.io.text.make_source(filepaths_or_buffers.read()) else: - datasource = move(make_source_from_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) elif compression == "bgzip": if isinstance(filepaths_or_buffers, TextIOBase): raise ValueError("bgzip compression requires a file path") @@ -60,30 +34,20 @@ def read_text(object filepaths_or_buffers, if len(compression_offsets) != 2: raise ValueError( "compression offsets need to consist of two elements") - c_compression_begin_offset = compression_offsets[0] - c_compression_end_offset = compression_offsets[1] - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode(), - c_compression_begin_offset, - c_compression_end_offset)) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + compression_offsets[0], + compression_offsets[1] + ) else: - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + ) else: raise ValueError("Only bgzip compression is supported at the moment") - c_options = parse_options() - if byte_range is not None: - c_byte_range_offset = byte_range[0] - c_byte_range_size = byte_range[1] - c_options.byte_range = byte_range_info( - c_byte_range_offset, - c_byte_range_size) - c_options.strip_delimiters = strip_delimiters - with nogil: - c_col = move(multibyte_split( - dereference(datasource), - delim, - c_options)) - - return Column.from_unique_ptr(move(c_col)) + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters + ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 4fd3d31841e..c2b760490c1 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -7,12 +7,7 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -ctypedef bool underlying_type_t_order -ctypedef bool underlying_type_t_null_order -ctypedef bool underlying_type_t_sorted -ctypedef int32_t underlying_type_t_interpolation ctypedef int32_t underlying_type_t_type_id -ctypedef bool underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 861bb063707..f169ea12b10 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -11,12 +11,6 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_order, - 
underlying_type_t_sorted, -) - import pylibcudf import cudf @@ -151,44 +145,6 @@ datetime_unit_map = { size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] -class Interpolation(IntEnum): - LINEAR = ( - libcudf_types.interpolation.LINEAR - ) - LOWER = ( - libcudf_types.interpolation.LOWER - ) - HIGHER = ( - libcudf_types.interpolation.HIGHER - ) - MIDPOINT = ( - libcudf_types.interpolation.MIDPOINT - ) - NEAREST = ( - libcudf_types.interpolation.NEAREST - ) - - -class Order(IntEnum): - ASCENDING = libcudf_types.order.ASCENDING - DESCENDING = libcudf_types.order.DESCENDING - - -class Sorted(IntEnum): - YES = libcudf_types.sorted.YES - NO = libcudf_types.sorted.NO - - -class NullOrder(IntEnum): - BEFORE = libcudf_types.null_order.BEFORE - AFTER = libcudf_types.null_order.AFTER - - -class NullHandling(IntEnum): - INCLUDE = libcudf_types.null_policy.INCLUDE - EXCLUDE = libcudf_types.null_policy.EXCLUDE - - cdef dtype_from_lists_column_view(column_view cv): # lists_column_view have no default constructor, so we heap # allocate it to get around Cython's limitation of requiring diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 856ce0f75de..3d70b01b7e4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5350,11 +5350,65 @@ def minhash( libstrings.minhash(self._column, seeds_column, width) ) + def minhash_permuted( + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book']) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + 0 [1305480171, 462824409, 74608232] + 1 [32665388, 65330773, 97996158] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def minhash64( self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. This function generates 2 uint64 values but only the first uint64 value is used. @@ -5390,6 +5444,59 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def minhash64_permuted( + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. 
+ + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + 0 [172452388517576012, 316595762085180527] + 1 [71427536958126239, 58787297728258215] + 2 [423885828176437114, 1140588505926961370] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash64_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ Compute the minhash of a list column of strings. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 205edd91d9d..2b4a17f9559 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -16,6 +16,8 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar @@ -789,15 +791,13 @@ def _quantile_table( column_order=(), null_precedence=(), ): - interpolation = libcudf.types.Interpolation[interpolation] + interpolation = plc.types.Interpolation[interpolation] - is_sorted = libcudf.types.Sorted["YES" if is_sorted else "NO"] + is_sorted = plc.types.Sorted["YES" if is_sorted else "NO"] - column_order = [libcudf.types.Order[key] for key in column_order] + column_order = [plc.types.Order[key] for key in column_order] - null_precedence = [ - libcudf.types.NullOrder[key] for key in null_precedence - ] + null_precedence = [plc.types.NullOrder[key] for key in null_precedence] return self._from_columns_like_self( libcudf.quantiles.quantile_table( diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index df7bbe22a61..e206c8bca08 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -351,6 +351,22 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "kvikio_remote_io", + _env_get_bool("CUDF_KVIKIO_REMOTE_IO", False), + textwrap.dedent( + """ + Whether to use KvikIO's remote IO backend or not. + \tWARN: this is experimental and may be removed at any time + \twithout warning or deprecation period. + \tSet KVIKIO_NTHREADS (default is 8) to change the number of + \tconcurrent tcp connections, which is important for good performance. + \tValid values are True or False. Default is False. 
+ """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 66a51a83896..b801654068e 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -52,4 +52,13 @@ def array_interface(self: _FastSlowProxy): def custom_iter(self: _FastSlowProxy): - return iter(self._fsproxy_slow) + """ + Custom iter method to handle the case where only the slow + object's iter method is used. + """ + # NOTE: Do not remove this method. This is required to avoid + # falling back to GPU for iter method. + return _maybe_wrap_result( + iter(self._fsproxy_slow), + None, # type: ignore + ) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 73afde407db..9768a6c4a2f 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -33,6 +33,20 @@ def call_operator(fn, args, kwargs): "EXECUTE_SLOW": 0x0571B0, } +# This is a dict of functions that are known to have arguments that +# need to be transformed from fast to slow only. i.e., Some cudf functions +# error on passing a device object but don't error on passing a host object. +# For example: DataFrame.__setitem__(arg, value) errors on passing a +# cudf.Index object but doesn't error on passing a pd.Index object. +# Hence we need to transform the arg from fast to slow only. So, we use +# a dictionary like: +# {"DataFrame.__setitem__": {0}} +# where the keys are the function names and the values are the indices +# (0-based) of the arguments that need to be transformed. + +_SPECIAL_FUNCTIONS_ARGS_MAP = { + "DataFrame.__setitem__": {0}, +} _WRAPPER_ASSIGNMENTS = tuple( attr @@ -875,6 +889,10 @@ def __name__(self, value): pass setattr(self._fsproxy_slow, "__name__", value) + @property + def _customqualname(self): + return self._fsproxy_slow.__qualname__ + def _assert_fast_slow_eq(left, right): if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: @@ -1011,7 +1029,36 @@ def _transform_arg( # use __reduce_ex__ instead... if type(arg) is tuple: # Must come first to avoid infinite recursion - return tuple(_transform_arg(a, attribute_name, seen) for a in arg) + if ( + len(arg) > 0 + and isinstance(arg[0], _MethodProxy) + and arg[0]._customqualname in _SPECIAL_FUNCTIONS_ARGS_MAP + ): + indices_map = _SPECIAL_FUNCTIONS_ARGS_MAP[ + arg[0]._customqualname + ] + method_proxy, original_args, original_kwargs = arg + + original_args = tuple( + _transform_arg(a, "_fsproxy_slow", seen) + if i - 1 in indices_map + else _transform_arg(a, attribute_name, seen) + for i, a in enumerate(original_args) + ) + original_kwargs = _transform_arg( + original_kwargs, attribute_name, seen + ) + return tuple( + ( + _transform_arg(method_proxy, attribute_name, seen), + original_args, + original_kwargs, + ) + ) + else: + return tuple( + _transform_arg(a, attribute_name, seen) for a in arg + ) elif hasattr(arg, "__getnewargs_ex__"): # Partial implementation of to reconstruct with # transformed pieces @@ -1099,7 +1146,9 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: """ Wraps "result" in a fast-slow proxy if is a "proxiable" object. 
""" - if _is_final_type(result): + if isinstance(result, (int, str, float, bool, type(None))): + return result + elif _is_final_type(result): typ = get_final_type_map()[type(result)] return typ._fsproxy_wrap(result, func) elif _is_intermediate_type(result): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c9ce24d2a5b..96512dacb69 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -193,11 +193,6 @@ def parquet_file(request, tmp_path_factory, pdf): return fname -@pytest.fixture(scope="module") -def rdg_seed(): - return int(os.environ.get("TEST_CUDF_RDG_SEED", "42")) - - def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): test_pdf = pd.DataFrame( [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)], @@ -405,14 +400,14 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): assert_eq(expect, got) -def test_parquet_read_metadata(tmpdir, pdf): +def test_parquet_read_metadata(tmp_path, pdf): if len(pdf) > 100: pytest.skip("Skipping long setup test") def num_row_groups(rows, group_size): return max(1, (rows + (group_size - 1)) // group_size) - fname = tmpdir.join("metadata.parquet") + fname = tmp_path / "metadata.parquet" row_group_size = 5 pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) @@ -431,7 +426,7 @@ def num_row_groups(rows, group_size): assert a == b -def test_parquet_read_filtered(tmpdir, rdg_seed): +def test_parquet_read_filtered(tmpdir): # Generate data fname = tmpdir.join("filtered.parquet") dg.generate( @@ -455,13 +450,13 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng(seed=None).integers( + lambda: np.random.default_rng(seed=0).integers( 0, 100, size=40 ), True, ), ], - seed=rdg_seed, + seed=42, ), format={"name": "parquet", "row_group_size": 64}, ) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 0958b68084d..afb82f75bcf 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -69,6 +69,7 @@ def s3_base(endpoint_ip, endpoint_port): # with an S3 endpoint on localhost endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" + os.environ["AWS_ENDPOINT_URL"] = endpoint_uri server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) server.start() @@ -105,6 +106,15 @@ def s3_context(s3_base, bucket, files=None): pass +@pytest.fixture( + params=[True, False], + ids=["kvikio=ON", "kvikio=OFF"], +) +def kvikio_remote_io(request): + with cudf.option_context("kvikio_remote_io", request.param): + yield request.param + + @pytest.fixture def pdf(scope="module"): df = pd.DataFrame() @@ -193,6 +203,7 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): def test_read_parquet( s3_base, s3so, + kvikio_remote_io, pdf, bytes_per_thread, columns, diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 997ca357986..47e541fdcef 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,68 +882,48 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash(): +def test_minhash_permuted(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + params = cudf.Series([1, 2, 3], dtype=np.uint32) expected = cudf.Series( [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - 
cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32), + cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32), None, cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, width=5) + actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + params = cudf.Series([1, 2, 3], dtype=np.uint64) expected = cudf.Series( [ cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], + [105531920695060180, 172452388517576009, 316595762085180524], dtype=np.uint64, ), cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], + [35713768479063122, 71427536958126236, 58787297728258212], dtype=np.uint64, ), None, cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64(seeds=seeds, width=5) + actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds="a") + strings.str.minhash_permuted(1, a="a", b="b", width=7) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_permuted(1, a=params, b=params, width=6) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_permuted(1, a=params, b=params, width=8) def test_word_minhash(): diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d636f36f282..aecb7ae7c5c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -16,6 +16,7 @@ import pandas as pd from fsspec.core import expand_paths_if_needed, get_fs_token_paths +import cudf from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -1624,6 +1625,16 @@ def _maybe_expand_directories(paths, glob_pattern, fs): return expanded_paths +def _use_kvikio_remote_io(fs) -> bool: + """Whether `kvikio_remote_io` is enabled and `fs` refers to a S3 file""" + + try: + from s3fs.core import S3FileSystem + except ImportError: + return False + return cudf.get_option("kvikio_remote_io") and isinstance(fs, S3FileSystem) + + @doc_get_reader_filepath_or_buffer() def get_reader_filepath_or_buffer( path_or_data, @@ -1649,17 +1660,17 @@ def get_reader_filepath_or_buffer( ) ] if not input_sources: - raise ValueError("Empty input source list: {input_sources}.") + raise ValueError(f"Empty input source list: {input_sources}.") filepaths_or_buffers = [] string_paths = [isinstance(source, str) for source in input_sources] if any(string_paths): - # Sources are all strings. Thes strings are typically + # Sources are all strings. 
The strings are typically
        # file paths, but they may also be raw text strings.

        # Don't allow a mix of source types
        if not all(string_paths):
-            raise ValueError("Invalid input source list: {input_sources}.")
+            raise ValueError(f"Invalid input source list: {input_sources}.")

        # Make sure we define a filesystem (if possible)
        paths = input_sources
@@ -1712,11 +1723,17 @@
                 raise FileNotFoundError(
                     f"{input_sources} could not be resolved to any files"
                 )
-            filepaths_or_buffers = _prefetch_remote_buffers(
-                paths,
-                fs,
-                **(prefetch_options or {}),
-            )
+
+            # If `kvikio_remote_io` is enabled and `fs` refers to an S3 file,
+            # we create S3 URLs and let them pass through to libcudf.
+            if _use_kvikio_remote_io(fs):
+                filepaths_or_buffers = [f"s3://{fpath}" for fpath in paths]
+            else:
+                filepaths_or_buffers = _prefetch_remote_buffers(
+                    paths,
+                    fs,
+                    **(prefetch_options or {}),
+                )
     else:
         raw_text_input = True

diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 3e7d1cf3c4c..d48fbad0ec3 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -12,6 +12,7 @@
 import pickle
 import subprocess
 import tempfile
+import time
 import types
 from io import BytesIO, StringIO

@@ -1777,3 +1778,43 @@ def test_cudf_pandas_util_version(attrs):
         assert not hasattr(pd.util, attrs)
     else:
         assert hasattr(pd.util, attrs)
+
+
+def test_iteration_over_dataframe_dtypes_produces_proxy_objects(dataframe):
+    _, xdf = dataframe
+    xdf["b"] = xpd.IntervalIndex.from_arrays(xdf["a"], xdf["b"])
+    xdf["a"] = xpd.Series([1, 1, 1, 2, 3], dtype="category")
+    dtype_series = xdf.dtypes
+    assert all(is_proxy_object(x) for x in dtype_series)
+    assert isinstance(dtype_series.iloc[0], xpd.CategoricalDtype)
+    assert isinstance(dtype_series.iloc[1], xpd.IntervalDtype)
+
+
+def test_iter_doesnot_raise(monkeypatch):
+    s = xpd.Series([1, 2, 3])
+    with monkeypatch.context() as monkeycontext:
+        monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True")
+        for _ in s:
+            pass
+
+
+def test_dataframe_setitem_slowdown():
+    # We are explicitly testing the slowdown of the setitem operation
+    df = xpd.DataFrame(
+        {"a": [1, 2, 3] * 100000, "b": [1, 2, 3] * 100000}
+    ).astype("float64")
+    df = xpd.DataFrame({"a": df["a"].repeat(1000), "b": df["b"].repeat(1000)})
+    new_df = df + 1
+    start_time = time.time()
+    df[df.columns] = new_df
+    end_time = time.time()
+    delta = int(end_time - start_time)
+    if delta > 5:
+        pytest.fail(f"Test took too long to run, runtime: {delta}")
+
+
+def test_dataframe_setitem():
+    df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).astype("float64")
+    new_df = df + 1
+    df[df.columns] = new_df
+    tm.assert_equal(df, new_df)

diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 1eadceaaccd..280dd52bb22 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -24,14 +24,14 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "libcudf==24.12.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13",
+    "numba-cuda>=0.0.13,<0.0.18",
    "numpy>=1.23,<3.0a0",
    "nvtx>=0.2.1",
    "packaging",
    "pandas>=2.0,<2.2.4dev0",
    "ptxcompiler",
-    "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'",
-    "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'",
+    "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'",
+    "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'",
    "pylibcudf==24.12.*,>=0.0.0a0",
    "rich",
    "rmm==24.12.*,>=0.0.0a0",
@@ -83,6 +83,14
@@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index ec0bc0eb22b..b2ea3f06e48 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -47,6 +47,14 @@ rapids = ["rmm", "cudf", "dask_cudf"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 66c15f694ee..ba4858c5619 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -12,7 +12,7 @@ from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator # Check we have a supported polars version from cudf_polars.utils.versions import _ensure_polars_version @@ -22,7 +22,7 @@ __all__: list[str] = [ "execute_with_cudf", - "translate_ir", + "Translator", "__git_commit__", "__version__", ] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 76816ee0a61..d085f21e0ad 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -18,7 +18,7 @@ import rmm from rmm._cuda import gpu -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: from collections.abc import Generator @@ -148,12 +148,7 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf( - nt: NodeTraverser, - *, - config: GPUEngine, - exception: type[Exception] | tuple[type[Exception], ...] = Exception, -) -> None: +def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -165,10 +160,15 @@ def execute_with_cudf( config GPUEngine configuration object - exception - Optional exception, or tuple of exceptions, to catch during - translation. Defaults to ``Exception``. + Raises + ------ + ValueError + If the config contains unsupported keys. + NotImplementedError + If translation of the plan is unsupported. + Notes + ----- The NodeTraverser is mutated if the libcudf executor can handle the plan. 
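A hedged usage sketch (assuming a polars installation with GPU support; the callback itself is registered by polars when a query is collected with the GPU engine, not called directly):

    import polars as pl

    q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)
    # Dispatches through execute_with_cudf; on unsupported plans the
    # engine warns (POLARS_VERBOSE) or raises (raise_on_fail).
    result = q.collect(engine="gpu")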
""" device = config.device @@ -178,22 +178,27 @@ def execute_with_cudf( raise ValueError( f"Engine configuration contains unsupported settings {unsupported}" ) - try: - with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf( - partial( - _callback, - translate_ir(nt), - device=device, - memory_resource=memory_resource, - ) + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + translator = Translator(nt) + ir = translator.translate_ir() + ir_translation_errors = translator.errors + if len(ir_translation_errors): + # TODO: Display these errors in user-friendly way. + # tracked in https://github.com/rapidsai/cudf/issues/17051 + unique_errors = sorted(set(ir_translation_errors), key=str) + formatted_errors = "\n".join( + f"- {e.__class__.__name__}: {e}" for e in unique_errors ) - except exception as e: - if bool(int(os.environ.get("POLARS_VERBOSE", 0))): - warnings.warn( - f"Query execution with GPU not supported, reason: {type(e)}: {e}", - PerformanceWarning, - stacklevel=2, + error_message = ( + "Query execution with GPU not possible: unsupported operations." + f"\nThe errors were:\n{formatted_errors}" + ) + exception = NotImplementedError(error_message, unique_errors) + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn(error_message, PerformanceWarning, stacklevel=2) + if raise_on_fail: + raise exception + else: + nt.set_udf( + partial(_callback, ir, device=device, memory_resource=memory_resource) ) - if raise_on_fail: - raise diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 08bc9d0ea3f..7560a0f5a64 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame: # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. 
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} - table: pa.Table = plc.interop.to_arrow( + table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e748ec16f14..326d6b65cbe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -19,6 +19,8 @@ from cudf_polars.dsl.expressions.base import ( AggInfo, Col, + ColRef, + ErrorExpr, Expr, NamedExpr, ) @@ -35,11 +37,13 @@ __all__ = [ "Expr", + "ErrorExpr", "NamedExpr", "Literal", "LiteralColumn", "Len", "Col", + "ColRef", "BooleanFunction", "StringFunction", "TemporalFunction", diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index effe8cb2378..23851f91938 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -20,7 +20,7 @@ from cudf_polars.containers import Column, DataFrame -__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext"] +__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext", "ColRef"] class AggInfo(NamedTuple): @@ -155,6 +155,17 @@ def collect_agg(self, *, depth: int) -> AggInfo: ) # pragma: no cover; check_agg trips first +class ErrorExpr(Expr): + __slots__ = ("error",) + _non_child = ("dtype", "error") + error: str + + def __init__(self, dtype: plc.DataType, error: str) -> None: + self.dtype = dtype + self.error = error + self.children = () + + class NamedExpr: # NamedExpr does not inherit from Expr since it does not appear # when evaluating expressions themselves, only when constructing @@ -249,3 +260,36 @@ def do_evaluate( def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" return AggInfo([(self, plc.aggregation.collect_list(), self)]) + + +class ColRef(Expr): + __slots__ = ("index", "table_ref") + _non_child = ("dtype", "index", "table_ref") + index: int + table_ref: plc.expressions.TableReference + + def __init__( + self, + dtype: plc.DataType, + index: int, + table_ref: plc.expressions.TableReference, + column: Expr, + ) -> None: + if not isinstance(column, Col): + raise TypeError("Column reference should only apply to columns") + self.dtype = dtype + self.index = index + self.table_ref = table_ref + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + raise NotImplementedError( + "Only expect this node as part of an expression translated to libcudf AST." 
+ ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 65fa4bfa62f..cd8e5c6a4eb 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -27,7 +27,9 @@ class TemporalFunction(Expr): __slots__ = ("name", "options") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + _COMPONENT_MAP: ClassVar[ + dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent] + ] = { pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index c16313bf83c..7eba0c110ab 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: class LiteralColumn(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") - value: pa.Array[Any, Any] + value: pa.Array[Any] def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: self.dtype = dtype diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a242ff9300f..98e8a83b04e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -29,8 +29,9 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame from cudf_polars.dsl.nodebase import Node -from cudf_polars.dsl.to_ast import to_parquet_filter +from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter from cudf_polars.utils import dtypes +from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: from collections.abc import Callable, Hashable, MutableMapping, Sequence @@ -41,6 +42,7 @@ __all__ = [ "IR", + "ErrorNode", "PythonScan", "Scan", "Cache", @@ -48,6 +50,7 @@ "Select", "GroupBy", "Join", + "ConditionalJoin", "HStack", "Distinct", "Sort", @@ -210,6 +213,23 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) +class ErrorNode(IR): + """Represents an error translating the IR.""" + + __slots__ = ("error",) + _non_child = ( + "schema", + "error", + ) + error: str + """The error.""" + + def __init__(self, schema: Schema, error: str): + self.schema = schema + self.error = error + self.children = () + + class PythonScan(IR): """Representation of input from a python function.""" @@ -498,7 +518,7 @@ def do_evaluate( # Mask must have been applied. return df elif typ == "ndjson": - json_schema: list[tuple[str, str, list]] = [ + json_schema: list[plc.io.json.NameAndType] = [ (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( @@ -522,6 +542,12 @@ def do_evaluate( ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index + if POLARS_VERSION_GT_112: + # If we sliced away some data from the start, that + # shifts the row index. 
+ # But prior to 1.13, polars had this wrong, so we match behaviour + # https://github.com/pola-rs/polars/issues/19607 + offset += skip_rows dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) @@ -890,6 +916,66 @@ def do_evaluate( return DataFrame(broadcasted).slice(options.slice) +class ConditionalJoin(IR): + """A conditional inner join of two dataframes on a predicate.""" + + __slots__ = ("predicate", "options", "ast_predicate") + _non_child = ("schema", "predicate", "options") + predicate: expr.Expr + options: tuple + + def __init__( + self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR + ) -> None: + self.schema = schema + self.predicate = predicate + self.options = options + self.children = (left, right) + self.ast_predicate = to_ast(predicate) + _, join_nulls, zlice, suffix, coalesce = self.options + # Preconditions from polars + assert not join_nulls + assert not coalesce + if self.ast_predicate is None: + raise NotImplementedError( + f"Conditional join with predicate {predicate}" + ) # pragma: no cover; polars never delivers expressions we can't handle + self._non_child_args = (self.ast_predicate, zlice, suffix) + + @classmethod + def do_evaluate( + cls, + predicate: plc.expressions.Expression, + zlice: tuple[int, int] | None, + suffix: str, + left: DataFrame, + right: DataFrame, + ) -> DataFrame: + """Evaluate and return a dataframe.""" + lg, rg = plc.join.conditional_inner_join(left.table, right.table, predicate) + left = DataFrame.from_table( + plc.copying.gather( + left.table, lg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + left.column_names, + ) + right = DataFrame.from_table( + plc.copying.gather( + right.table, rg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + right.column_names, + ) + right = right.rename_columns( + { + name: f"{name}{suffix}" + for name in right.column_names + if name in left.column_names_set + } + ) + result = left.with_columns(right.columns) + return result.slice(zlice) + + class Join(IR): """A join of two dataframes.""" @@ -1464,7 +1550,7 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" - ) + ) # pragma: no cover self.options = ( tuple(indices), tuple(pivotees), diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py index 228d300f467..dd5c40a00be 100644 --- a/python/cudf_polars/cudf_polars/dsl/nodebase.py +++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py @@ -43,9 +43,7 @@ class Node(Generic[T]): def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]: return (*(getattr(self, attr) for attr in self._non_child), *children) - def reconstruct( - self, children: Sequence[T] - ) -> Self: # pragma: no cover; not yet used + def reconstruct(self, children: Sequence[T]) -> Self: """ Rebuild this node with new children. 
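The "not yet used" pragma on `reconstruct` is dropped because the expression rewrites in to_ast.py below now exercise it. A hedged sketch of the intended pattern (`transform` is a hypothetical child-rewriting pass, not part of the patch):

    def transform(node):
        new_children = tuple(transform(c) for c in node.children)
        # reconstruct swaps in new children while keeping every
        # _non_child attribute (dtype, options, ...) unchanged
        return node.reconstruct(new_children)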
diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py index 9a0838631cc..acc4b3669af 100644 --- a/python/cudf_polars/cudf_polars/dsl/to_ast.py +++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py @@ -14,12 +14,14 @@ from pylibcudf import expressions as plc_expr from cudf_polars.dsl import expr -from cudf_polars.dsl.traversal import CachingVisitor +from cudf_polars.dsl.traversal import CachingVisitor, reuse_if_unchanged from cudf_polars.typing import GenericTransformer if TYPE_CHECKING: from collections.abc import Mapping + from cudf_polars.typing import ExprTransformer + # Can't merge these op-mapping dictionaries because scoped enum values # are exposed by cython with equality/hash based one their underlying # representation type. So in a dict they are just treated as integers. @@ -128,7 +130,14 @@ def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression: def _(node: expr.Col, self: Transformer) -> plc_expr.Expression: if self.state["for_parquet"]: return plc_expr.ColumnNameReference(node.name) - return plc_expr.ColumnReference(self.state["name_to_index"][node.name]) + raise TypeError("Should always be wrapped in a ColRef node before translation") + + +@_to_ast.register +def _(node: expr.ColRef, self: Transformer) -> plc_expr.Expression: + if self.state["for_parquet"]: + raise TypeError("Not expecting ColRef node in parquet filter") + return plc_expr.ColumnReference(node.index, node.table_ref) @_to_ast.register @@ -238,9 +247,7 @@ def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None: return None -def to_ast( - node: expr.Expr, *, name_to_index: Mapping[str, int] -) -> plc_expr.Expression | None: +def to_ast(node: expr.Expr) -> plc_expr.Expression | None: """ Convert an expression to libcudf AST nodes suitable for compute_column. @@ -248,18 +255,66 @@ def to_ast( ---------- node Expression to convert. - name_to_index - Mapping from column names to their index in the table that - will be used for expression evaluation. + + Notes + ----- + `Col` nodes must always be wrapped in `TableRef` nodes when + converting to an ast expression so that their table reference and + index are provided. Returns ------- - pylibcudf Expressoin if conversion is possible, otherwise None. + pylibcudf Expression if conversion is possible, otherwise None. """ - mapper = CachingVisitor( - _to_ast, state={"for_parquet": False, "name_to_index": name_to_index} - ) + mapper = CachingVisitor(_to_ast, state={"for_parquet": False}) try: return mapper(node) except (KeyError, NotImplementedError): return None + + +def _insert_colrefs(node: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(node, expr.Col): + return expr.ColRef( + node.dtype, + rec.state["name_to_index"][node.name], + rec.state["table_ref"], + node, + ) + return reuse_if_unchanged(node, rec) + + +def insert_colrefs( + node: expr.Expr, + *, + table_ref: plc.expressions.TableReference, + name_to_index: Mapping[str, int], +) -> expr.Expr: + """ + Insert column references into an expression before conversion to libcudf AST. + + Parameters + ---------- + node + Expression to insert references into. + table_ref + pylibcudf `TableReference` indicating whether column + references are coming from the left or right table. + name_to_index: + Mapping from column names to column indices in the table + eventually used for evaluation. 
+ + Notes + ----- + All column references are wrapped in the same, singular, table + reference, so this function relies on the expression only + containing column references from a single table. + + Returns + ------- + New expression with column references inserted. + """ + mapper = CachingVisitor( + _insert_colrefs, state={"table_ref": table_ref, "name_to_index": name_to_index} + ) + return mapper(node) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 5181214819e..e8ed009cdf2 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -21,14 +21,127 @@ import pylibcudf as plc from cudf_polars.dsl import expr, ir -from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged +from cudf_polars.dsl.to_ast import insert_colrefs from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: - from cudf_polars.typing import ExprTransformer + from cudf_polars.typing import NodeTraverser -__all__ = ["translate_ir", "translate_named_expr"] +__all__ = ["Translator", "translate_named_expr"] + + +class Translator: + """ + Translates polars-internal IR nodes and expressions to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + """ + + def __init__(self, visitor: NodeTraverser): + self.visitor = visitor + self.errors: list[Exception] = [] + + def translate_ir(self, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. + + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError + If the version of Polars IR is unsupported. + + Notes + ----- + Any expression nodes that cannot be translated are replaced by + :class:`expr.ErrorNode` nodes and collected in the the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported. + """ + ctx: AbstractContextManager[None] = ( + set_node(self.visitor, n) if n is not None else noop_context + ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + if (version := self.visitor.version()) >= (4, 0): + e = NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + self.errors.append(e) # pragma: no cover + raise e # pragma: no cover + + with ctx: + polars_schema = self.visitor.get_schema() + try: + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + except Exception as e: + self.errors.append(NotImplementedError(str(e))) + return ir.ErrorNode({}, str(e)) + try: + node = self.visitor.view_current_node() + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + try: + result = _translate_ir(node, self, schema) + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + error = NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
+ ) + self.errors.append(error) + return ir.ErrorNode(schema, str(error)) + + return result + + def translate_expr(self, *, n: int) -> expr.Expr: + """ + Translate a polars-internal expression IR into our representation. + + Parameters + ---------- + n + Node to translate, an integer referencing a polars internal node. + + Returns + ------- + Translated IR object. + + Notes + ----- + Any expression nodes that cannot be translated are replaced by + :class:`expr.ErrorExpr` nodes and collected in the the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported. + """ + node = self.visitor.view_expression(n) + dtype = dtypes.from_polars(self.visitor.get_dtype(n)) + try: + return _translate_expr(node, self, dtype) + except Exception as e: + self.errors.append(e) + return expr.ErrorExpr(dtype, str(e)) class set_node(AbstractContextManager[None]): @@ -70,7 +183,7 @@ def __exit__(self, *args: Any) -> None: @singledispatch def _translate_ir( - node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: Any, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -79,19 +192,19 @@ def _translate_ir( @_translate_ir.register def _( - node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.PythonScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: scan_fn, with_columns, source_type, predicate, nrows = node.options options = (scan_fn, with_columns, source_type, nrows) predicate = ( - translate_named_expr(visitor, n=predicate) if predicate is not None else None + translate_named_expr(translator, n=predicate) if predicate is not None else None ) return ir.PythonScan(schema, options, predicate) @_translate_ir.register def _( - node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Scan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type if typ == "ndjson": @@ -120,7 +233,7 @@ def _( skip_rows, n_rows, row_index, - translate_named_expr(visitor, n=node.predicate) + translate_named_expr(translator, n=node.predicate) if node.predicate is not None else None, ) @@ -128,20 +241,20 @@ def _( @_translate_ir.register def _( - node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Cache, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + return ir.Cache(schema, node.id_, translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.DataFrameScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.DataFrameScan( schema, node.df, node.projection, - translate_named_expr(visitor, n=node.selection) + translate_named_expr(translator, n=node.selection) if node.selection is not None else None, ) @@ -149,22 +262,22 @@ def _( @_translate_ir.register def _( - node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Select, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = 
[translate_named_expr(translator, n=e) for e in node.expr] return ir.Select(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.GroupBy, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] - keys = [translate_named_expr(visitor, n=e) for e in node.keys] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + aggs = [translate_named_expr(translator, n=e) for e in node.aggs] + keys = [translate_named_expr(translator, n=e) for e in node.keys] return ir.GroupBy( schema, keys, @@ -177,17 +290,17 @@ def _( @_translate_ir.register def _( - node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Join, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. - with set_node(visitor, node.input_left): - inp_left = translate_ir(visitor, n=None) - left_on = [translate_named_expr(visitor, n=e) for e in node.left_on] - with set_node(visitor, node.input_right): - inp_right = translate_ir(visitor, n=None) - right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] + with set_node(translator.visitor, node.input_left): + inp_left = translator.translate_ir(n=None) + left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + with set_node(translator.visitor, node.input_right): + inp_right = translator.translate_ir(n=None) + right_on = [translate_named_expr(translator, n=e) for e in node.right_on] if (how := node.options[0]) in { "inner", "left", @@ -204,80 +317,65 @@ def _( raise NotImplementedError( f"Unsupported join type {how}" ) # pragma: no cover; asof joins not yet exposed - # No exposure of mixed/conditional joins in pylibcudf yet, so in - # the first instance, implement by doing a cross join followed by - # a filter. - _, join_nulls, zlice, suffix, coalesce = node.options - cross = ir.Join( - schema, - [], - [], - ("cross", join_nulls, None, suffix, coalesce), - inp_left, - inp_right, - ) - dtype = plc.DataType(plc.TypeId.BOOL8) if op2 is None: ops = [op1] else: ops = [op1, op2] - suffix = cross.options[3] - - # Column references in the right table refer to the post-join - # names, so with suffixes. 
- def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: - if isinstance(e, expr.Col) and e.name in inp_left.schema: - return type(e)(e.dtype, f"{e.name}{suffix}") - return reuse_if_unchanged(e, rec) - - mapper = make_recursive(_rename) - right_on = [ - expr.NamedExpr( - f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new - ) - for new, old in zip( - (mapper(e.value) for e in right_on), right_on, strict=True - ) - ] - mask = functools.reduce( + + dtype = plc.DataType(plc.TypeId.BOOL8) + predicate = functools.reduce( functools.partial( expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND ), ( - expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + expr.BinOp( + dtype, + expr.BinOp._MAPPING[op], + insert_colrefs( + left.value, + table_ref=plc.expressions.TableReference.LEFT, + name_to_index={ + name: i for i, name in enumerate(inp_left.schema) + }, + ), + insert_colrefs( + right.value, + table_ref=plc.expressions.TableReference.RIGHT, + name_to_index={ + name: i for i, name in enumerate(inp_right.schema) + }, + ), + ) for op, left, right in zip(ops, left_on, right_on, strict=True) ), ) - filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) - if zlice is not None: - offset, length = zlice - return ir.Slice(schema, offset, length, filtered) - return filtered + + return ir.ConditionalJoin(schema, predicate, node.options, inp_left, inp_right) @_translate_ir.register def _( - node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HStack, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.exprs] return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Reduce, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # pragma: no cover; polars doesn't emit this node yet - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] return ir.Reduce(schema, exprs, inp) @_translate_ir.register def _( - node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Distinct, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: (keep, subset, maintain_order, zlice) = node.options keep = ir.Distinct._KEEP_MAP[keep] @@ -288,17 +386,17 @@ def _( subset, zlice, maintain_order, - translate_ir(visitor, n=node.input), + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Sort, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - by = [translate_named_expr(visitor, n=e) for e in node.by_column] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + by = [translate_named_expr(translator, n=e) for e in node.by_column] stable, nulls_last, descending = node.sort_options 
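For orientation, a minimal sketch (not part of the patch) of the pylibcudf-level operation the new `ConditionalJoin` node bottoms out in: column references are resolved to `(index, table)` pairs, as `insert_colrefs` does above, and the resulting AST predicate drives `conditional_inner_join`. The tables and predicate here are hypothetical.

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.expressions import (
    ASTOperator,
    ColumnReference,
    Operation,
    TableReference,
)

left = plc.interop.from_arrow(pa.table({"a": [1, 2, 3]}))
right = plc.interop.from_arrow(pa.table({"b": [2, 3, 4]}))

# Predicate left.a < right.b, with each column referenced by its index
# in the table it comes from (cf. ColRef/insert_colrefs above).
predicate = Operation(
    ASTOperator.LESS,
    ColumnReference(0, TableReference.LEFT),
    ColumnReference(0, TableReference.RIGHT),
)
# Returns a pair of gather maps selecting the matching row indices.
lg, rg = plc.join.conditional_inner_join(left, right, predicate)
```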
     order, null_order = sorting.sort_order(
         descending, nulls_last=nulls_last, num_keys=len(by)
@@ -308,33 +406,35 @@
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Slice, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input))
+    return ir.Slice(
+        schema, node.offset, node.len, translator.translate_ir(n=node.input)
+    )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Filter, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        mask = translate_named_expr(visitor, n=node.predicate)
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        mask = translate_named_expr(translator, n=node.predicate)
     return ir.Filter(schema, mask, inp)
 
 
 @_translate_ir.register
 def _(
     node: pl_ir.SimpleProjection,
-    visitor: NodeTraverser,
+    translator: Translator,
     schema: dict[str, plc.DataType],
 ) -> ir.IR:
-    return ir.Projection(schema, translate_ir(visitor, n=node.input))
+    return ir.Projection(schema, translator.translate_ir(n=node.input))
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     name, *options = node.function
     return ir.MapFunction(
@@ -342,83 +442,36 @@
         name,
         options,
         # TODO: merge_sorted breaks this pattern
-        translate_ir(visitor, n=node.input),
+        translator.translate_ir(n=node.input),
     )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Union, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     return ir.Union(
-        schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs)
+        schema, node.options, *(translator.translate_ir(n=n) for n in node.inputs)
     )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.HConcat, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs))
-
-
-def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
-    """
-    Translate a polars-internal IR node to our representation.
-
-    Parameters
-    ----------
-    visitor
-        Polars NodeTraverser object
-    n
-        Optional node to start traversing from, if not provided uses
-        current polars-internal node.
-
-    Returns
-    -------
-    Translated IR object
-
-    Raises
-    ------
-    NotImplementedError
-        If we can't translate the nodes due to unsupported functionality.
-    """
-    ctx: AbstractContextManager[None] = (
-        set_node(visitor, n) if n is not None else noop_context
-    )
-    # IR is versioned with major.minor, minor is bumped for backwards
-    # compatible changes (e.g. adding new nodes), major is bumped for
-    # incompatible changes (e.g. renaming nodes).
-    if (version := visitor.version()) >= (4, 0):
-        raise NotImplementedError(
-            f"No support for polars IR {version=}"
-        )  # pragma: no cover; no such version for now.
-
-    with ctx:
-        polars_schema = visitor.get_schema()
-        node = visitor.view_current_node()
-        schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()}
-        result = _translate_ir(node, visitor, schema)
-        if any(
-            isinstance(dtype, pl.Null)
-            for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values())
-        ):
-            raise NotImplementedError(
-                f"No GPU support for {result} with Null column dtype."
-            )
-        return result
+    return ir.HConcat(schema, *(translator.translate_ir(n=n) for n in node.inputs))
 
 
 def translate_named_expr(
-    visitor: NodeTraverser, *, n: pl_expr.PyExprIR
+    translator: Translator, *, n: pl_expr.PyExprIR
 ) -> expr.NamedExpr:
     """
     Translate a polars-internal named expression IR object into our representation.
 
     Parameters
     ----------
-    visitor
-        Polars NodeTraverser object
+    translator
+        Translator object
     n
         Node to translate, a named expression node.
 
@@ -438,12 +491,12 @@
     NotImplementedError
         If any translation fails due to unsupported functionality.
     """
-    return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node))
+    return expr.NamedExpr(n.output_name, translator.translate_expr(n=n.node))
 
 
 @singledispatch
 def _translate_expr(
-    node: Any, visitor: NodeTraverser, dtype: plc.DataType
+    node: Any, translator: Translator, dtype: plc.DataType
 ) -> expr.Expr:
     raise NotImplementedError(
         f"Translation for {type(node).__name__}"
@@ -451,7 +504,7 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     name, *options = node.function_data
     options = tuple(options)
     if isinstance(name, pl_expr.StringFunction):
@@ -460,7 +513,7 @@
             pl_expr.StringFunction.StripCharsStart,
             pl_expr.StringFunction.StripCharsEnd,
         }:
-            column, chars = (translate_expr(visitor, n=n) for n in node.input)
+            column, chars = (translator.translate_expr(n=n) for n in node.input)
             if isinstance(chars, expr.Literal):
                 if chars.value == pa.scalar(""):
                     # No-op in polars, but libcudf uses empty string
@@ -477,11 +530,11 @@
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.BooleanFunction):
         if name == pl_expr.BooleanFunction.IsBetween:
-            column, lo, hi = (translate_expr(visitor, n=n) for n in node.input)
+            column, lo, hi = (translator.translate_expr(n=n) for n in node.input)
             (closed,) = options
             lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed]
             return expr.BinOp(
@@ -494,7 +547,7 @@
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.TemporalFunction):
         # functions for which evaluation of the expression may not return
@@ -514,14 +567,14 @@
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
         if name in needs_cast:
             return expr.Cast(dtype, result_expr)
         return result_expr
 
     elif isinstance(name, str):
-        children = (translate_expr(visitor, n=n) for n in node.input)
+        children = (translator.translate_expr(n=n) for n in node.input)
         if name == "log":
             (base,) = options
             (child,) = children
@@ -540,26 +593,26 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Window, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby?
     if isinstance(node.options, pl_expr.RollingGroupOptions):
         # pl.col("a").rolling(...)
         return expr.RollingWindow(
-            dtype, node.options, translate_expr(visitor, n=node.function)
+            dtype, node.options, translator.translate_expr(n=node.function)
         )
     elif isinstance(node.options, pl_expr.WindowMapping):
         # pl.col("a").over(...)
         return expr.GroupedRollingWindow(
             dtype,
             node.options,
-            translate_expr(visitor, n=node.function),
-            *(translate_expr(visitor, n=n) for n in node.partition_by),
+            translator.translate_expr(n=node.function),
+            *(translator.translate_expr(n=n) for n in node.partition_by),
         )
     assert_never(node.options)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Literal, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     if isinstance(node.value, plrs.PySeries):
         return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value))
     value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
@@ -567,42 +620,42 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby
-    return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr))
+    return expr.Sort(dtype, node.options, translator.translate_expr(n=node.expr))
 
 
 @_translate_expr.register
-def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.SortBy(
         dtype,
         node.sort_options,
-        translate_expr(visitor, n=node.expr),
-        *(translate_expr(visitor, n=n) for n in node.by),
+        translator.translate_expr(n=node.expr),
+        *(translator.translate_expr(n=n) for n in node.by),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Gather, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Gather(
         dtype,
-        translate_expr(visitor, n=node.expr),
-        translate_expr(visitor, n=node.idx),
+        translator.translate_expr(n=node.expr),
+        translator.translate_expr(n=node.idx),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Filter, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Filter(
         dtype,
-        translate_expr(visitor, n=node.input),
-        translate_expr(visitor, n=node.by),
+        translator.translate_expr(n=node.input),
+        translator.translate_expr(n=node.by),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    inner = translate_expr(visitor, n=node.expr)
+def _(node: pl_expr.Cast, translator: Translator, dtype: plc.DataType) -> expr.Expr:
+    inner = translator.translate_expr(n=node.expr)
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
         return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
@@ -614,17 +667,17 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Column, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Col(dtype, node.name)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Agg, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     value = expr.Agg(
         dtype,
         node.name,
         node.options,
-        *(translate_expr(visitor, n=n) for n in node.arguments),
+        *(translator.translate_expr(n=n) for n in node.arguments),
     )
     if value.name == "count" and value.dtype.id() != plc.TypeId.INT32:
         return expr.Cast(value.dtype, value)
@@ -632,55 +685,30 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Ternary, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Ternary(
         dtype,
-        translate_expr(visitor, n=node.predicate),
-        translate_expr(visitor, n=node.truthy),
-        translate_expr(visitor, n=node.falsy),
+        translator.translate_expr(n=node.predicate),
+        translator.translate_expr(n=node.truthy),
+        translator.translate_expr(n=node.falsy),
     )
 
 
 @_translate_expr.register
 def _(
-    node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType
+    node: pl_expr.BinaryExpr, translator: Translator, dtype: plc.DataType
 ) -> expr.Expr:
     return expr.BinOp(
         dtype,
         expr.BinOp._MAPPING[node.op],
-        translate_expr(visitor, n=node.left),
-        translate_expr(visitor, n=node.right),
+        translator.translate_expr(n=node.left),
+        translator.translate_expr(n=node.right),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Len, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     value = expr.Len(dtype)
     if dtype.id() != plc.TypeId.INT32:
         return expr.Cast(dtype, value)
     return value  # pragma: no cover; never reached since polars len has uint32 dtype
-
-
-def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr:
-    """
-    Translate a polars-internal expression IR into our representation.
-
-    Parameters
-    ----------
-    visitor
-        Polars NodeTraverser object
-    n
-        Node to translate, an integer referencing a polars internal node.
-
-    Returns
-    -------
-    Translated IR object.
-
-    Raises
-    ------
-    NotImplementedError
-        If any translation fails due to unsupported functionality.
-    """
-    node = visitor.view_expression(n)
-    dtype = dtypes.from_polars(visitor.get_dtype(n))
-    return _translate_expr(node, visitor, dtype)
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 7b45c1eaa06..2207545aa60 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -10,7 +10,7 @@
 from polars import GPUEngine
 from polars.testing.asserts import assert_frame_equal
 
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 if TYPE_CHECKING:
     import polars as pl
@@ -117,12 +117,14 @@
     AssertionError
         If the specified exceptions were not raised.
     """
-    try:
-        _ = translate_ir(q._ldf.visit())
-    except exceptions:
+    translator = Translator(q._ldf.visit())
+    translator.translate_ir()
+    if errors := translator.errors:
+        for err in errors:
+            assert any(
+                isinstance(err, err_type) for err_type in exceptions
+            ), f"Translation DID NOT RAISE {exceptions}"
         return
-    except Exception as e:
-        raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e
     else:
         raise AssertionError(f"Translation DID NOT RAISE {exceptions}")
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 2f95cd38c57..080a1af6e19 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -40,7 +40,7 @@ def pytest_configure(config: pytest.Config) -> None:
     )
     config.addinivalue_line(
         "filterwarnings",
-        "ignore:.*Query execution with GPU not supported",
+        "ignore:.*Query execution with GPU not possible",
     )
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index a90c283ee54..e7ac72df609 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -71,11 +71,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool:
     -------
     True if casting is supported, False otherwise
     """
+    has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY
     return (
         (
-            plc.traits.is_fixed_width(to)
-            and plc.traits.is_fixed_width(from_)
-            and plc.unary.is_supported_cast(from_, to)
+            from_ == to
+            or not has_empty
+            and (
+                plc.traits.is_fixed_width(to)
+                and plc.traits.is_fixed_width(from_)
+                and plc.unary.is_supported_cast(from_, to)
+            )
         )
         or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to))
         or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_))
diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
index a119cab3b74..b08cede8f7f 100644
--- a/python/cudf_polars/cudf_polars/utils/versions.py
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -14,6 +14,8 @@
 POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11")
 POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12")
+POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12")
+POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13")
 
 
 def _ensure_polars_version():
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index 17a94c633f8..2f2361223d2 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -458,12 +458,12 @@ translate it to our intermediate representation (IR), and then execute
 and convert back to polars:
 
 ```python
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 q = ...
 
 # Convert to our IR
-ir = translate_ir(q._ldf.visit())
+ir = Translator(q._ldf.visit()).translate_ir()
 
 # DataFrame living on the device
 result = ir.evaluate(cache={})
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 2e75dff5c9e..785e87391e7 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.11,<1.13",
+    "polars>=1.11,<1.14",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -49,6 +49,14 @@ license-files = ["LICENSE"]
 [tool.setuptools.dynamic]
 version = {file = "cudf_polars/VERSION"}
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py
index 57d794d4890..f6c24da0180 100644
--- a/python/cudf_polars/tests/dsl/test_to_ast.py
+++ b/python/cudf_polars/tests/dsl/test_to_ast.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import pyarrow as pa
 import pytest
 
 import polars as pl
@@ -10,10 +11,11 @@
 
 import pylibcudf as plc
 
+import cudf_polars.dsl.expr as expr_nodes
 import cudf_polars.dsl.ir as ir_nodes
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.containers.dataframe import DataFrame, NamedColumn
-from cudf_polars.dsl.to_ast import to_ast
+from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter
 
 
 @pytest.fixture(scope="module")
@@ -58,14 +60,21 @@ def df():
 )
 def test_compute_column(expr, df):
     q = df.select(expr)
-    ir = translate_ir(q._ldf.visit())
+    ir = Translator(q._ldf.visit()).translate_ir()
 
     assert isinstance(ir, ir_nodes.Select)
     table = ir.children[0].evaluate(cache={})
     name_to_index = {c.name: i for i, c in enumerate(table.columns)}
 
     def compute_column(e):
-        ast = to_ast(e.value, name_to_index=name_to_index)
+        e_with_colrefs = insert_colrefs(
+            e.value,
+            table_ref=plc.expressions.TableReference.LEFT,
+            name_to_index=name_to_index,
+        )
+        with pytest.raises(NotImplementedError):
+            e_with_colrefs.evaluate(table)
+        ast = to_ast(e_with_colrefs)
         if ast is not None:
             return NamedColumn(
                 plc.transform.compute_column(table.table, ast), name=e.name
@@ -77,3 +86,28 @@ def compute_column(e):
 
     expect = q.collect()
     assert_frame_equal(expect, got)
+
+
+def test_invalid_colref_construction_raises():
+    literal = expr_nodes.Literal(
+        plc.DataType(plc.TypeId.INT8), pa.scalar(1, type=pa.int8())
+    )
+    with pytest.raises(TypeError):
+        expr_nodes.ColRef(
+            literal.dtype, 0, plc.expressions.TableReference.LEFT, literal
+        )
+
+
+def test_to_ast_without_colref_raises():
+    col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a")
+
+    with pytest.raises(TypeError):
+        to_ast(col)
+
+
+def test_to_parquet_filter_with_colref_raises():
+    col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a")
+    colref = expr_nodes.ColRef(col.dtype, 0, plc.expressions.TableReference.LEFT, col)
+
+    with pytest.raises(TypeError):
+        to_parquet_filter(colref)
diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py
index 15c644d7978..8958c2a0f84 100644
--- a/python/cudf_polars/tests/dsl/test_traversal.py
+++ b/python/cudf_polars/tests/dsl/test_traversal.py
@@ -10,7 +10,7 @@
 
 import pylibcudf as plc
 
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.dsl import expr, ir
 from cudf_polars.dsl.traversal import (
     CachingVisitor,
@@ -109,7 +109,7 @@ def test_rewrite_ir_node():
     df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]})
     q = df.group_by("a").agg(pl.col("b").sum()).sort("b")
 
-    orig = translate_ir(q._ldf.visit())
+    orig = Translator(q._ldf.visit()).translate_ir()
 
     new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]})
 
@@ -150,7 +150,7 @@ def replace_scan(node, rec):
     mapper = CachingVisitor(replace_scan)
 
-    orig = translate_ir(q._ldf.visit())
+    orig = Translator(q._ldf.visit()).translate_ir()
     new = mapper(orig)
 
     result = new.evaluate(cache={}).to_polars()
@@ -174,7 +174,7 @@ def test_rewrite_names_and_ops():
         .collect()
     )
 
-    qir = translate_ir(q._ldf.visit())
+    qir = Translator(q._ldf.visit()).translate_ir()
 
     @singledispatch
     def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr:
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
index 62df8ce1498..6170281ad54 100644
--- a/python/cudf_polars/tests/expressions/test_sort.py
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -10,7 +10,7 @@
 
 import pylibcudf as plc
 
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
@@ -68,7 +68,7 @@ def test_setsorted(descending, nulls_last, with_nulls):
 
     assert_gpu_result_equal(q)
 
-    df = translate_ir(q._ldf.visit()).evaluate(cache={})
+    df = Translator(q._ldf.visit()).translate_ir().evaluate(cache={})
 
     a = df.column_map["a"]
diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py
index 9900f598e5f..25b71716eed 100644
--- a/python/cudf_polars/tests/test_config.py
+++ b/python/cudf_polars/tests/test_config.py
@@ -30,7 +30,7 @@ def raise_unimplemented(self, *args):
         pytest.raises(pl.exceptions.ComputeError),
         pytest.warns(
             pl.exceptions.PerformanceWarning,
-            match="Query execution with GPU not supported",
+            match="Query execution with GPU not possible",
         ),
     ):
         # And ensure that collecting issues the correct warning.
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 8ca7a7b9264..2fcbbf21f1c 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -13,7 +13,7 @@
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
-from cudf_polars.utils.versions import POLARS_VERSION_LT_112
+from cudf_polars.utils.versions import POLARS_VERSION_LT_112, POLARS_VERSION_LT_113
 
 
 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"])
@@ -110,7 +110,11 @@ def test_cross_join(left, right, zlice):
 
 
 @pytest.mark.parametrize(
-    "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))]
+    "left_on,right_on",
+    [
+        (pl.col("a"), pl.lit(2, dtype=pl.Int64)),
+        (pl.lit(2, dtype=pl.Int64), pl.col("a")),
+    ],
 )
 def test_join_literal_key_unsupported(left, right, left_on, right_on):
     q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
@@ -125,7 +129,13 @@
         [pl.col("a_right") <= pl.col("a") * 2],
         [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")],
         [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")],
-        [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2],
+        pytest.param(
+            [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2],
+            marks=pytest.mark.xfail(
+                POLARS_VERSION_LT_113,
+                reason="https://github.com/pola-rs/polars/issues/19597",
+            ),
+        ),
     ],
 )
 def test_join_where(left, right, conditions, zlice):
diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py
index e895f27f637..63aa1c573a9 100644
--- a/python/cudf_polars/tests/test_mapfunction.py
+++ b/python/cudf_polars/tests/test_mapfunction.py
@@ -93,16 +93,3 @@ def test_unpivot_defaults():
     )
     q = df.unpivot(index="d")
     assert_gpu_result_equal(q)
-
-
-def test_unpivot_unsupported_cast_raises():
-    df = pl.LazyFrame(
-        {
-            "a": ["x", "y", "z"],
-            "b": pl.Series([1, 3, 5], dtype=pl.Int16),
-        }
-    )
-
-    q = df.unpivot(["a", "b"])
-
-    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index d3baf3bf4d2..dd67a019c77 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -65,6 +65,14 @@ include = [
 ]
 exclude = ["*tests*"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.ruff]
 extend = "../../pyproject.toml"
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index 48cea7266af..a7a116875ea 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -5,6 +5,8 @@
 from dask_expr.io.io import FusedParquetIO
 from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS
 
+from dask._task_spec import Task
+
 import cudf
 
 from dask_cudf import _deprecated_api
@@ -19,7 +21,7 @@ def _load_multiple_files(
         frag_filters,
         columns,
         schema,
-        *to_pandas_args,
+        **to_pandas_kwargs,
     ):
         import pyarrow as pa
 
@@ -46,7 +48,7 @@ def _load_multiple_files(
         )
         return CudfReadParquetPyarrowFS._table_to_pandas(
             get(dsk, name),
-            *to_pandas_args,
+            **to_pandas_kwargs,
         )
 
@@ -89,7 +91,7 @@ def _table_to_pandas(table, index_name):
             df = df.set_index(index_name)
         return df
 
-    def _filtered_task(self, index: int):
+    def _filtered_task(self, name, index: int):
         columns = self.columns.copy()
         index_name = self.index.name
         if self.index is not None:
@@ -99,16 +101,20 @@ def _filtered_task(self, index: int):
         if columns is None:
             columns = list(schema.names)
             columns.append(index_name)
-        return (
+        return Task(
+            name,
             self._table_to_pandas,
-            (
+            Task(
+                None,
                 self._fragment_to_table,
-                FragmentWrapper(self.fragments[index], filesystem=self.fs),
-                self.filters,
-                columns,
-                schema,
+                fragment_wrapper=FragmentWrapper(
+                    self.fragments[index], filesystem=self.fs
+                ),
+                filters=self.filters,
+                columns=columns,
+                schema=schema,
             ),
-            index_name,
+            index_name=index_name,
         )
 
     def _tune_up(self, parent):
diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py
index 4351b672151..f11a5252080 100644
--- a/python/dask_cudf/dask_cudf/tests/test_reductions.py
+++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-import numpy as np
-import pandas as pd
 import pytest
 
 import dask
@@ -10,20 +8,7 @@
 
 import cudf
 
 import dask_cudf
-
-
-def _make_random_frame(nelem, npartitions=2):
-    rng = np.random.default_rng(seed=0)
-    df = pd.DataFrame(
-        {
-            "x": rng.integers(0, 5, size=nelem),
-            "y": rng.normal(loc=1.0, scale=1.0, size=nelem),
-        }
-    )
-    gdf = cudf.DataFrame.from_pandas(df)
-    dgf = dask_cudf.from_cudf(gdf, npartitions=npartitions)
-    return df, dgf
-
+from dask_cudf.tests.utils import _make_random_frame
 
 _reducers = ["sum", "count", "mean", "var", "std", "min", "max"]
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index a9f61f75762..b44b3f939e7 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -19,7 +19,7 @@
 
 def _make_random_frame(nelem, npartitions=2, include_na=False):
-    rng = np.random.default_rng(seed=None)
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {"x": rng.random(size=nelem), "y": rng.random(size=nelem)}
     )
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index c7e4cbc45ea..07d9143db36 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
 [project.optional-dependencies]
 test = [
     "dask-cuda==24.12.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13",
+    "numba-cuda>=0.0.13,<0.0.18",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",
@@ -81,6 +81,14 @@ section-order = ["future", "standard-library", "third-party", "dask", "rapids",
 dask = ["dask", "distributed", "dask_cuda"]
 rapids = ["rmm", "cudf"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/libcudf/cmake/Modules/WheelHelpers.cmake b/python/libcudf/cmake/Modules/WheelHelpers.cmake
deleted file mode 100644
index 278d6751c15..00000000000
--- a/python/libcudf/cmake/Modules/WheelHelpers.cmake
+++ /dev/null
@@ -1,59 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-include_guard(GLOBAL)
-
-# Making libraries available inside wheels by installing the associated targets.
-function(install_aliased_imported_targets)
-  list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets")
-
-  set(options "")
-  set(one_value "DESTINATION")
-  set(multi_value "TARGETS")
-  cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN})
-
-  message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'")
-
-  foreach(target IN LISTS __TARGETS)
-
-    if(NOT TARGET ${target})
-      message(VERBOSE "No target named ${target}")
-      continue()
-    endif()
-
-    get_target_property(alias_target ${target} ALIASED_TARGET)
-    if(alias_target)
-      set(target ${alias_target})
-    endif()
-
-    get_target_property(is_imported ${target} IMPORTED)
-    if(NOT is_imported)
-      # If the target isn't imported, install it into the wheel
-      install(TARGETS ${target} DESTINATION ${__DESTINATION})
-      message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})")
-    else()
-      # If the target is imported, make sure it's global
-      get_target_property(type ${target} TYPE)
-      if(${type} STREQUAL "UNKNOWN_LIBRARY")
-        install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})
-        message(VERBOSE "install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})")
-      else()
-        install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION})
-        message(
-          VERBOSE
-          "install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION})"
-        )
-      endif()
-    endif()
-  endforeach()
-endfunction()
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 62726bb0df4..8c650eb2144 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -48,6 +48,14 @@ Homepage = "https://github.com/rapidsai/cudf"
 [project.entry-points."cmake.prefix"]
 libcudf = "libcudf"
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 600 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '525M'
+
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyi b/python/pylibcudf/pylibcudf/aggregation.pyi
new file mode 100644
index 00000000000..a59e2a9dc93
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/aggregation.pyi
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.types import (
+    DataType,
+    Interpolation,
+    NanEquality,
+    NullEquality,
+    NullOrder,
+    NullPolicy,
+    Order,
+)
+
+class Kind(IntEnum):
+    SUM = ...
+    PRODUCT = ...
+    MIN = ...
+    MAX = ...
+    COUNT_VALID = ...
+    COUNT_ALL = ...
+    ANY = ...
+    ALL = ...
+    SUM_OF_SQUARES = ...
+    MEAN = ...
+    VARIANCE = ...
+    STD = ...
+    MEDIAN = ...
+    QUANTILE = ...
+    ARGMAX = ...
+    ARGMIN = ...
+    NUNIQUE = ...
+    NTH_ELEMENT = ...
+    RANK = ...
+    COLLECT_LIST = ...
+    COLLECT_SET = ...
+    PTX = ...
+    CUDA = ...
+    CORRELATION = ...
+    COVARIANCE = ...
+
+class CorrelationType(IntEnum):
+    PEARSON = ...
+    KENDALL = ...
+    SPEARMAN = ...
+
+class EWMHistory(IntEnum):
+    INFINITE = ...
+    FINITE = ...
+
+class RankMethod(IntEnum):
+    FIRST = ...
+    AVERAGE = ...
+    MIN = ...
+    MAX = ...
+    DENSE = ...
+
+class RankPercentage(IntEnum):
+    NONE = ...
+    ZERO_NORMALIZED = ...
+    ONE_NORMALIZED = ...
+
+class UdfType(IntEnum):
+    CUDA = ...
+    PTX = ...
+
+class Aggregation:
+    def __init__(self): ...
+    def kind(self) -> Kind: ...
+
+def sum() -> Aggregation: ...
+def product() -> Aggregation: ...
+def min() -> Aggregation: ...
+def max() -> Aggregation: ...
+def count(null_handling: NullPolicy = NullPolicy.INCLUDE) -> Aggregation: ...
+def any() -> Aggregation: ...
+def all() -> Aggregation: ...
+def sum_of_squares() -> Aggregation: ...
+def mean() -> Aggregation: ...
+def variance(ddof: int = 1) -> Aggregation: ...
+def std(ddof: int = 1) -> Aggregation: ...
+def median() -> Aggregation: ...
+def quantile(
+    quantiles: list[float], interp: Interpolation = Interpolation.LINEAR
+) -> Aggregation: ...
+def argmax() -> Aggregation: ...
+def argmin() -> Aggregation: ...
+def ewma(center_of_mass: float, history: EWMHistory) -> Aggregation: ...
+def nunique(null_handling: NullPolicy = NullPolicy.EXCLUDE) -> Aggregation: ...
+def nth_element(
+    n: int, null_handling: NullPolicy = NullPolicy.INCLUDE
+) -> Aggregation: ...
+def collect_list(
+    null_handling: NullPolicy = NullPolicy.INCLUDE,
+) -> Aggregation: ...
+def collect_set(
+    null_handling: NullPolicy = NullPolicy.INCLUDE,
+    nulls_equal: NullEquality = NullEquality.EQUAL,
+    nans_equal: NanEquality = NanEquality.ALL_EQUAL,
+) -> Aggregation: ...
+def udf(operation: str, output_type: DataType) -> Aggregation: ...
+def correlation(type: CorrelationType, min_periods: int) -> Aggregation: ...
+def covariance(min_periods: int, ddof: int) -> Aggregation: ...
+def rank(
+    method: RankMethod,
+    column_order: Order = Order.ASCENDING,
+    null_handling: NullPolicy = NullPolicy.EXCLUDE,
+    null_precedence: NullOrder = NullOrder.AFTER,
+    percentage: RankPercentage = RankPercentage.NONE,
+) -> Aggregation: ...
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx
index e510b738f70..662f76d5c8e 100644
--- a/python/pylibcudf/pylibcudf/aggregation.pyx
+++ b/python/pylibcudf/pylibcudf/aggregation.pyx
@@ -64,6 +64,40 @@
 from pylibcudf.libcudf.aggregation import udf_type as UdfType  # no-cython-lint
 
 from .types cimport DataType
 
+__all__ = [
+    "Aggregation",
+    "CorrelationType",
+    "EWMHistory",
+    "Kind",
+    "RankMethod",
+    "RankPercentage",
+    "UdfType",
+    "all",
+    "any",
+    "argmax",
+    "argmin",
+    "collect_list",
+    "collect_set",
+    "correlation",
+    "count",
+    "covariance",
+    "ewma",
+    "max",
+    "mean",
+    "median",
+    "min",
+    "nth_element",
+    "nunique",
+    "product",
+    "quantile",
+    "rank",
+    "std",
+    "sum",
+    "sum_of_squares",
+    "udf",
+    "variance",
+]
+
 cdef class Aggregation:
     """A type of aggregation to perform.
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi
new file mode 100644
index 00000000000..f745e6c6854
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/binaryop.pyi
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.types import DataType
+
+class BinaryOperator(IntEnum):
+    ADD = ...
+    SUB = ...
+    MUL = ...
+    DIV = ...
+    TRUE_DIV = ...
+    FLOOR_DIV = ...
+    MOD = ...
+    PMOD = ...
+    PYMOD = ...
+    POW = ...
+    INT_POW = ...
+    LOG_BASE = ...
+    ATAN2 = ...
+    SHIFT_LEFT = ...
+    SHIFT_RIGHT = ...
+    SHIFT_RIGHT_UNSIGNED = ...
+    BITWISE_AND = ...
+    BITWISE_OR = ...
+    BITWISE_XOR = ...
+    LOGICAL_AND = ...
+    LOGICAL_OR = ...
+    EQUAL = ...
+    NOT_EQUAL = ...
+    LESS = ...
+    GREATER = ...
+    LESS_EQUAL = ...
+    GREATER_EQUAL = ...
+    NULL_EQUALS = ...
+    NULL_MAX = ...
+    NULL_MIN = ...
+    NULL_NOT_EQUALS = ...
+    GENERIC_BINARY = ...
+    NULL_LOGICAL_AND = ...
+    NULL_LOGICAL_OR = ...
+    INVALID_BINARY = ...
+
+def binary_operation(
+    lhs: Column | Scalar,
+    rhs: Column | Scalar,
+    op: BinaryOperator,
+    output_type: DataType,
+) -> Column: ...
+def is_supported_operation(
+    out: DataType, lhs: DataType, rhs: DataType, op: BinaryOperator
+) -> bool: ...
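A small sketch of how the API described by the new `binaryop.pyi` stub is used (the column contents are hypothetical):

```python
import pyarrow as pa
import pylibcudf as plc

lhs = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int32()))
rhs = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int32()))
out = plc.DataType(plc.TypeId.INT32)

# The stub advertises a support check alongside the operation itself.
if plc.binaryop.is_supported_operation(
    out, lhs.type(), rhs.type(), plc.binaryop.BinaryOperator.ADD
):
    result = plc.binaryop.binary_operation(
        lhs, rhs, plc.binaryop.BinaryOperator.ADD, out
    )
```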
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
index eef73bf4e9d..b7b4ecc6e83 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -16,6 +16,7 @@
 from .column cimport Column
 from .scalar cimport Scalar
 from .types cimport DataType
 
+__all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"]
 
 cpdef Column binary_operation(
     LeftBinaryOperand lhs,
diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi
new file mode 100644
index 00000000000..c9f70de3dbf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/column.pyi
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Sequence
+from typing import Any
+
+from pylibcudf.gpumemoryview import gpumemoryview
+from pylibcudf.scalar import Scalar
+from pylibcudf.types import DataType
+
+class Column:
+    def __init__(
+        self,
+        data_type: DataType,
+        size: int,
+        data: gpumemoryview | None,
+        mask: gpumemoryview | None,
+        null_count: int,
+        offset: int,
+        children: list[Column],
+    ) -> None: ...
+    def type(self) -> DataType: ...
+    def child(self, index: int) -> Column: ...
+    def size(self) -> int: ...
+    def null_count(self) -> int: ...
+    def offset(self) -> int: ...
+    def data(self) -> gpumemoryview | None: ...
+    def null_mask(self) -> gpumemoryview | None: ...
+    def children(self) -> list[Column]: ...
+    def copy(self) -> Column: ...
+    def with_mask(
+        self, mask: gpumemoryview | None, null_count: int
+    ) -> Column: ...
+    def list_view(self) -> ListColumnView: ...
+    @staticmethod
+    def from_scalar(scalar: Scalar, size: int) -> Column: ...
+    @staticmethod
+    def all_null_like(like: Column, size: int) -> Column: ...
+    @staticmethod
+    def from_cuda_array_interface_obj(obj: Any) -> Column: ...
+
+class ListColumnView:
+    def __init__(self, column: Column): ...
+    def child(self) -> Column: ...
+    def offsets(self) -> Column: ...
+
+def is_c_contiguous(
+    shape: Sequence[int], strides: Sequence[int], itemsize: int
+) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx
index 4e5698566d0..9bb5574608e 100644
--- a/python/pylibcudf/pylibcudf/column.pyx
+++ b/python/pylibcudf/pylibcudf/column.pyx
@@ -17,6 +17,7 @@
 from .utils cimport int_to_bitmask_ptr, int_to_void_ptr
 
 import functools
 
+__all__ = ["Column", "ListColumnView", "is_c_contiguous"]
 
 cdef class Column:
     """A container of nullable device data as a column of elements.
@@ -61,6 +62,8 @@
         self._children = children
         self._num_children = len(children)
 
+    __hash__ = None
+
     cdef column_view view(self) nogil:
         """Generate a libcudf column_view to pass to libcudf algorithms.
@@ -384,6 +387,8 @@
             raise TypeError("Column is not a list type")
         self._column = col
 
+    __hash__ = None
+
     cpdef child(self):
         """The data column of the underlying list column."""
         return self._column.child(1)
diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi
new file mode 100644
index 00000000000..c87fe423acb
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/column_factories.pyi
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.column import Column
+from pylibcudf.types import DataType, MaskState, TypeId
+
+def make_empty_column(type_or_id: DataType | TypeId) -> Column: ...
+def make_numeric_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_fixed_point_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_timestamp_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_duration_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_fixed_width_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx
index ac942a620b5..c4969a7f502 100644
--- a/python/pylibcudf/pylibcudf/column_factories.pyx
+++ b/python/pylibcudf/pylibcudf/column_factories.pyx
@@ -17,6 +17,15 @@
 from .types cimport DataType, type_id
 
 from .types import MaskState, TypeId
 
+__all__ = [
+    "make_duration_column",
+    "make_empty_column",
+    "make_fixed_point_column",
+    "make_fixed_width_column",
+    "make_numeric_column",
+    "make_timestamp_column",
+]
+
 cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id):
     """Creates an empty column of the specified type.
diff --git a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi
new file mode 100644
index 00000000000..79076f509e0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/concatenate.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+
+def concatenate[ColumnOrTable: (Column, Table)](
+    objects: list[ColumnOrTable],
+) -> ColumnOrTable: ...
diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx
index 10c860d97bb..42c5f34cf3e 100644
--- a/python/pylibcudf/pylibcudf/concatenate.pyx
+++ b/python/pylibcudf/pylibcudf/concatenate.pyx
@@ -12,6 +12,7 @@
 from pylibcudf.libcudf.table.table_view cimport table_view
 
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["concatenate"]
 
 cpdef concatenate(list objects):
     """Concatenate columns or tables.
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyi b/python/pylibcudf/pylibcudf/contiguous_split.pyi
new file mode 100644
index 00000000000..dd6328fbf23
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.gpumemoryview import gpumemoryview
+from pylibcudf.table import Table
+
+class PackedColumns:
+    def __init__(self): ...
+    def release(self) -> tuple[memoryview, gpumemoryview]: ...
+
+def pack(input: Table) -> PackedColumns: ...
+def unpack(input: PackedColumns) -> Table: ...
+def unpack_from_memoryviews(
+    metadata: memoryview, gpu_data: gpumemoryview
+) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx
index ed926a3fcc0..94873e079c9 100644
--- a/python/pylibcudf/pylibcudf/contiguous_split.pyx
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx
@@ -20,6 +20,13 @@
 from .table cimport Table
 from .utils cimport int_to_void_ptr
 
+__all__ = [
+    "PackedColumns",
+    "pack",
+    "unpack",
+    "unpack_from_memoryviews",
+]
+
 cdef class HostBuffer:
     """Owning host buffer that implements the buffer protocol"""
     cdef unique_ptr[vector[uint8_t]] c_obj
@@ -38,6 +45,8 @@
         out.strides[0] = 1
         return out
 
+    __hash__ = None
+
     def __getbuffer__(self, Py_buffer *buffer, int flags):
         buffer.buf = dereference(self.c_obj).data()
         buffer.format = NULL  # byte
@@ -69,6 +78,8 @@
             "Use one of the factories."
         )
 
+    __hash__ = None
+
     @staticmethod
     cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data):
         """Create a Python PackedColumns from a libcudf packed_columns."""
diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi
new file mode 100644
index 00000000000..6cf4ed48724
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/copying.pyi
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+from typing import TypeVar
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+class MaskAllocationPolicy(IntEnum):
+    NEVER = ...
+    RETAIN = ...
+    ALWAYS = ...
+
+class OutOfBoundsPolicy(IntEnum):
+    NULLIFY = ...
+    DONT_CHECK = ...
+
+ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)
+
+def gather(
+    source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy
+) -> Table: ...
+def scatter(
+    source: Table | list[Scalar], scatter_map: Column, target_table: Table
+) -> Table: ...
+def empty_like(input: ColumnOrTable) -> ColumnOrTable: ...
+def allocate_like(
+    input_column: Column, policy: MaskAllocationPolicy, size: int | None = None
+) -> Column: ...
+def copy_range_in_place(
+    input_column: Column,
+    target_column: Column,
+    input_begin: int,
+    input_end: int,
+    target_begin: int,
+) -> Column: ...
+def copy_range(
+    input_column: Column,
+    target_column: Column,
+    input_begin: int,
+    input_end: int,
+    target_begin: int,
+) -> Column: ...
+def shift(input: Column, offset: int, fill_value: Scalar) -> Column: ...
+def slice(input: ColumnOrTable, indices: list[int]) -> list[ColumnOrTable]: ...
+def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
+def copy_if_else(
+    lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column
+) -> Column: ...
+def boolean_mask_scatter(
+    input: Table | list[Scalar], target: Table, boolean_mask: Column
+) -> Table: ...
+def get_element(input_column: Column, index: int) -> Scalar: ...
diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx
index 4938f1a3dda..fb8b6f9890e 100644
--- a/python/pylibcudf/pylibcudf/copying.pyx
+++ b/python/pylibcudf/pylibcudf/copying.pyx
@@ -36,6 +36,23 @@
 from .table cimport Table
 from .utils cimport _as_vector
 
+__all__ = [
+    "MaskAllocationPolicy",
+    "OutOfBoundsPolicy",
+    "allocate_like",
+    "boolean_mask_scatter",
+    "copy_if_else",
+    "copy_range",
+    "copy_range_in_place",
+    "empty_like",
+    "gather",
+    "get_element",
+    "scatter",
+    "shift",
+    "slice",
+    "split",
+]
+
 cpdef Table gather(
     Table source_table,
     Column gather_map,
diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi
new file mode 100644
index 00000000000..6a3ae7953d9
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/datetime.pyi
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class DatetimeComponent(IntEnum):
+    YEAR = ...
+    MONTH = ...
+    DAY = ...
+    WEEKDAY = ...
+    HOUR = ...
+    MINUTE = ...
+    SECOND = ...
+    MILLISECOND = ...
+    MICROSECOND = ...
+    NANOSECOND = ...
+
+class RoundingFrequency(IntEnum):
+    DAY = ...
+    HOUR = ...
+    MINUTE = ...
+    SECOND = ...
+    MILLISECOND = ...
+    MICROSECOND = ...
+    NANOSECOND = ...
+
+def extract_millisecond_fraction(input: Column) -> Column: ...
+def extract_microsecond_fraction(input: Column) -> Column: ...
+def extract_nanosecond_fraction(input: Column) -> Column: ...
+def extract_datetime_component( + input: Column, component: DatetimeComponent +) -> Column: ... +def ceil_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def floor_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def round_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def add_calendrical_months( + input: Column, months: Column | Scalar +) -> Column: ... +def day_of_year(input: Column) -> Column: ... +def is_leap_year(input: Column) -> Column: ... +def last_day_of_month(input: Column) -> Column: ... +def extract_quarter(input: Column) -> Column: ... +def days_in_month(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 9e5e709d81d..b100e3e22d0 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -29,6 +29,24 @@ from cython.operator cimport dereference from .column cimport Column +__all__ = [ + "DatetimeComponent", + "RoundingFrequency", + "add_calendrical_months", + "ceil_datetimes", + "day_of_year", + "days_in_month", + "extract_datetime_component", + "extract_microsecond_fraction", + "extract_millisecond_fraction", + "extract_nanosecond_fraction", + "extract_quarter", + "floor_datetimes", + "is_leap_year", + "last_day_of_month", + "round_datetimes", +] + cpdef Column extract_millisecond_fraction( Column input ): diff --git a/python/pylibcudf/pylibcudf/experimental.pyi b/python/pylibcudf/pylibcudf/experimental.pyi new file mode 100644 index 00000000000..bbfb86b0ff6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/experimental.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +def enable_prefetching(key: str) -> None: ... +def disable_prefetching(key: str) -> None: ... +def prefetch_debugging(enable: bool) -> None: ... diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx index b25a53e13b2..d94d6d087ac 100644 --- a/python/pylibcudf/pylibcudf/experimental.pyx +++ b/python/pylibcudf/pylibcudf/experimental.pyx @@ -5,6 +5,8 @@ from libcpp.string cimport string from pylibcudf.libcudf cimport experimental as cpp_experimental +__all__ = ["disable_prefetching", "enable_prefetching", "prefetch_debugging"] + cpdef enable_prefetching(str key): """Turn on prefetch instructions for the given key. diff --git a/python/pylibcudf/pylibcudf/expressions.pyi b/python/pylibcudf/pylibcudf/expressions.pyi new file mode 100644 index 00000000000..12b473d8605 --- /dev/null +++ b/python/pylibcudf/pylibcudf/expressions.pyi @@ -0,0 +1,79 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +from pylibcudf.scalar import Scalar + +class TableReference(IntEnum): + LEFT = ... + RIGHT = ... + +class ASTOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PYMOD = ... + POW = ... + EQUAL = ... + NULL_EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + NULL_LOGICAL_AND = ... + LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + LOGICAL_OR = ... + IDENTITY = ... + IS_NULL = ... + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... 
+
+class Expression:
+    def __init__(self): ...
+
+class Literal(Expression):
+    def __init__(self, value: Scalar): ...
+
+class ColumnReference(Expression):
+    def __init__(
+        self, index: int, table_source: TableReference = TableReference.LEFT
+    ): ...
+
+class ColumnNameReference(Expression):
+    def __init__(self, name: str): ...
+
+class Operation(Expression):
+    def __init__(
+        self,
+        op: ASTOperator,
+        left: Expression,
+        right: Expression | None = None,
+    ): ...
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx
index 1535f68366b..0f12cfe313c 100644
--- a/python/pylibcudf/pylibcudf/expressions.pyx
+++ b/python/pylibcudf/pylibcudf/expressions.pyx
@@ -49,6 +49,16 @@ from .types cimport DataType
 # Aliases for simplicity
 ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
 
+__all__ = [
+    "ASTOperator",
+    "ColumnNameReference",
+    "ColumnReference",
+    "Expression",
+    "Literal",
+    "Operation",
+    "TableReference",
+]
+
 # Define this class just to have a docstring for it
 cdef class Expression:
     """
@@ -58,7 +68,7 @@ cdef class Expression:
 
     For details, see :cpp:class:`cudf::ast::expression`.
     """
-    pass
+    __hash__ = None
 
 cdef class Literal(Expression):
     """
diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd
index b9345f8cd42..56aef086e1b 100644
--- a/python/pylibcudf/pylibcudf/filling.pxd
+++ b/python/pylibcudf/pylibcudf/filling.pxd
@@ -33,3 +33,9 @@ cpdef Table repeat(
     Table input_table,
     ColumnOrSize count
 )
+
+cpdef Column calendrical_month_sequence(
+    size_type n,
+    Scalar init,
+    size_type months,
+)
diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi
new file mode 100644
index 00000000000..0b5e29bdc32
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/filling.pyi
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+def fill(
+    destination: Column, begin: int, end: int, value: Scalar
+) -> Column: ...
+def fill_in_place(
+    destination: Column, begin: int, end: int, value: Scalar
+) -> None: ...
+def sequence(size: int, init: Scalar, step: Scalar) -> Column: ...
+def repeat(input_table: Table, count: Column | int) -> Table: ...
+def calendrical_month_sequence(
+    n: int, init: Scalar, months: int
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx
index a47004a1e42..ea5b45ff7c2 100644
--- a/python/pylibcudf/pylibcudf/filling.pyx
+++ b/python/pylibcudf/pylibcudf/filling.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.filling cimport (
     fill_in_place as cpp_fill_in_place,
     repeat as cpp_repeat,
     sequence as cpp_sequence,
+    calendrical_month_sequence as cpp_calendrical_month_sequence
 )
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
@@ -18,6 +19,14 @@ from .scalar cimport Scalar
 from .table cimport Table
 
 
+__all__ = [
+    "fill",
+    "fill_in_place",
+    "repeat",
+    "sequence",
+    "calendrical_month_sequence",
+]
+
 cpdef Column fill(
     Column destination,
     size_type begin,
@@ -164,3 +173,39 @@ cpdef Table repeat(
         count
     )
     return Table.from_libcudf(move(result))
+
+
+cpdef Column calendrical_month_sequence(
+    size_type n,
+    Scalar init,
+    size_type months,
+):
+
+    """Generate a column of n timestamps beginning at init and incrementing by months.
+
+    For details, see :cpp:func:`calendrical_month_sequence`.
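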
+ + Parameters + ---------- + n : size_type + Number of timestamps to generate + init : Scalar + The initial timestamp + months : size_type + Months to increment + + Returns + ------- + pylibcudf.Column + Timestamps column with sequences of months + """ + + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_calendrical_month_sequence( + n, + dereference(init.c_obj), + months + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi new file mode 100644 index 00000000000..50f1f39a515 --- /dev/null +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping +from typing import Any + +class gpumemoryview: + def __init__(self, data: Any): ... + @property + def __cuda_array_interface__(self) -> Mapping[str, Any]: ... diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx index 0904022a944..41316eddb60 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +__all__ = ["gpumemoryview"] cdef class gpumemoryview: """Minimal representation of a memory buffer. @@ -25,3 +26,5 @@ cdef class gpumemoryview: @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ + + __hash__ = None diff --git a/python/pylibcudf/pylibcudf/groupby.pyi b/python/pylibcudf/pylibcudf/groupby.pyi new file mode 100644 index 00000000000..883ad6e34cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/groupby.pyi @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.replace import ReplacePolicy +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order, Sorted + +class GroupByRequest: + def __init__( + self, values: Column, aggregations: list[Aggregation] + ) -> None: ... + +class GroupBy: + def __init__( + self, + keys: Table, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + keys_are_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, + ) -> None: ... + def aggregate( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def scan( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def shift( + self, values: Table, offset: list[int], fill_values: list[Scalar] + ) -> tuple[Table, Table]: ... + def replace_nulls( + self, value: Table, replace_policies: list[ReplacePolicy] + ) -> tuple[Table, Table]: ... + def get_groups( + self, values: Table | None = None + ) -> tuple[list[int], Table, Table]: ... diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index 71f9ecb0453..e6cb3ac81a7 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -25,6 +25,8 @@ from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector +__all__ = ["GroupBy", "GroupByRequest"] + cdef class GroupByRequest: """A request for a groupby aggregation or scan. 
@@ -45,6 +47,8 @@ cdef class GroupByRequest: self._values = values self._aggregations = aggregations + __hash__ = None + cdef aggregation_request _to_libcudf_agg_request(self) except *: """Convert to a libcudf aggregation_request object. @@ -127,6 +131,8 @@ cdef class GroupBy: # deallocated from under us: self._keys = keys + __hash__ = None + @staticmethod cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi new file mode 100644 index 00000000000..a849f5d0729 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Final + +from pylibcudf.column import Column +from pylibcudf.table import Table + +LIBCUDF_DEFAULT_HASH_SEED: Final[int] + +def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... +def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_64(input: Table, seed: int = ...) -> Column: ... +def md5(input: Table) -> Column: ... +def sha1(input: Table) -> Column: ... +def sha224(input: Table) -> Column: ... +def sha256(input: Table) -> Column: ... +def sha384(input: Table) -> Column: ... +def sha512(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 9ea3d4d1bda..548cffc0ce8 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -20,6 +20,19 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "LIBCUDF_DEFAULT_HASH_SEED", + "md5", + "murmurhash3_x64_128", + "murmurhash3_x86_32", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "xxhash_64", +] + LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED cpdef Column murmurhash3_x86_32( diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi new file mode 100644 index 00000000000..63de816010b --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pyi @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +from typing import Any, overload + +import pyarrow as pa + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import DataType + +@dataclass +class ColumnMetadata: + name: str = ... + children_meta: list[ColumnMetadata] = ... + +@overload +def from_arrow(obj: pa.DataType) -> DataType: ... +@overload +def from_arrow( + obj: pa.Scalar[Any], *, data_type: DataType | None = None +) -> Scalar: ... +@overload +def from_arrow(obj: pa.Array[Any]) -> Column: ... +@overload +def from_arrow(obj: pa.Table) -> Table: ... +@overload +def to_arrow( + obj: DataType, + *, + precision: int | None = None, + fields: Iterable[pa.Field[pa.DataType] | tuple[str, pa.DataType]] + | Mapping[str, pa.DataType] + | None = None, + value_type: pa.DataType | None = None, +) -> pa.DataType: ... +@overload +def to_arrow( + obj: Table, metadata: list[ColumnMetadata | str] | None = None +) -> pa.Table: ... +@overload +def to_arrow( + obj: Column, metadata: ColumnMetadata | str | None = None +) -> pa.Array[Any]: ... +@overload +def to_arrow( + obj: Scalar, metadata: ColumnMetadata | str | None = None +) -> pa.Scalar[Any]: ... +def from_dlpack(managed_tensor: Any) -> Table: ... +def to_dlpack(input: Table) -> Any: ... 
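The overloads above encode how from_arrow/to_arrow dispatch on the input type. A minimal round-trip sketch (assumes pyarrow is available; not part of the diff):

    import pyarrow as pa
    import pylibcudf as plc

    dtype = plc.interop.from_arrow(pa.int32())                # -> DataType
    scalar = plc.interop.from_arrow(pa.scalar(42))            # -> Scalar
    column = plc.interop.from_arrow(pa.array([1, 2, 3]))      # -> Column
    table = plc.interop.from_arrow(pa.table({"a": [1, 2]}))   # -> Table

    # ColumnMetadata supplies names when converting back to Arrow.
    arrow_table = plc.interop.to_arrow(
        table, metadata=[plc.interop.ColumnMetadata(name="a")]
    )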
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 61e812353b7..bd5397ac328 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -38,6 +38,14 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +__all__ = [ + "ColumnMetadata", + "from_arrow", + "from_dlpack", + "to_arrow", + "to_dlpack", +] + ARROW_TO_PYLIBCUDF_TYPES = { pa.int8(): type_id.INT8, pa.int16(): type_id.INT16, diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 965724a47b1..664faef718f 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx - types.pyx +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx + parquet_metadata.pyx text.pyx timezone.pyx types.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 1bcc0a3f963..663804e714d 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, timezone, types +from . cimport ( + avro, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 2e4f215b12c..f913a400684 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,31 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, timezone, types +from . import ( + avro, + csv, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types import SinkInfo, SourceInfo, TableWithMetadata + +__all__ = [ + "SinkInfo", + "SourceInfo", + "TableWithMetadata", + "avro", + "csv", + "datasource", + "json", + "orc", + "parquet", + "parquet_metadata", + "text", + "timezone", + "types", +] diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi new file mode 100644 index 00000000000..49c2f083702 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/avro.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +__all__ = ["read_avro"] + +def read_avro( + source_info: SourceInfo, + columns: list[str] | None = None, + skip_rows: int = 0, + num_rows: int = -1, +) -> TableWithMetadata: ... 
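A hedged read_avro sketch against the stub above ("events.avro" and its column names are hypothetical):

    import pylibcudf as plc

    result = plc.io.avro.read_avro(
        plc.io.SourceInfo(["events.avro"]),
        columns=["id", "payload"],
        skip_rows=0,
        num_rows=-1,  # -1 reads all remaining rows
    )
    names = result.column_names(include_children=False)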
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index fe765b34f82..4271333511a 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.io.avro cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["read_avro"] + cpdef TableWithMetadata read_avro( SourceInfo source_info, diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi new file mode 100644 index 00000000000..356825a927d --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping + +from pylibcudf.io.types import ( + CompressionType, + QuoteStyle, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +def read_csv( + source_info: SourceInfo, + *, + compression: CompressionType = CompressionType.AUTO, + byte_range_offset: int = 0, + byte_range_size: int = 0, + col_names: list[str] | None = None, + prefix: str = "", + mangle_dupe_cols: bool = True, + usecols: list[int] | list[str] | None = None, + nrows: int = -1, + skiprows: int = 0, + skipfooter: int = 0, + header: int = 0, + lineterminator: str = "\n", + delimiter: str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + delim_whitespace: bool = False, + skipinitialspace: bool = False, + skip_blank_lines: bool = True, + quoting: QuoteStyle = QuoteStyle.MINIMAL, + quotechar: str = '"', + doublequote: bool = True, + parse_dates: list[str] | list[int] | None = None, + parse_hex: list[str] | list[int] | None = None, + # Technically this should be dict/list + # but using a fused type prevents using None as default + dtypes: Mapping[str, DataType] | list[DataType] | None = None, + true_values: list[str] | None = None, + false_values: list[str] | None = None, + na_values: list[str] | None = None, + keep_default_na: bool = True, + na_filter: bool = True, + dayfirst: bool = False, + # Note: These options are supported by the libcudf reader + # but are not exposed here since there is no demand for them + # on the Python side yet. + # detect_whitespace_around_quotes: bool = False, + # timestamp_type: DataType = DataType(type_id.EMPTY), +) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 2c61cc42d82..858e580ab34 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -19,6 +19,8 @@ from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["read_csv"] + cdef tuple _process_parse_dates_hex(list cols): cdef vector[string] str_cols cdef vector[int] int_cols diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyi b/python/pylibcudf/pylibcudf/io/datasource.pyi new file mode 100644 index 00000000000..e52197f793b --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/datasource.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +class Datasource: + def __init__(self): ... 
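Most read_csv arguments above are keyword-only (the reader is def rather than cpdef for exactly this reason). A small sketch under that assumption; "measurements.csv" and its column names are hypothetical:

    import pylibcudf as plc
    from pylibcudf.types import DataType, TypeId

    tbl_w_meta = plc.io.csv.read_csv(
        plc.io.SourceInfo(["measurements.csv"]),
        delimiter=",",
        header=0,
        dtypes={"temp": DataType(TypeId.FLOAT64)},
        na_values=["NA", "null"],
    )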
diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx index 02418444caa..aac1c0d1014 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -2,8 +2,10 @@ from pylibcudf.libcudf.io.datasource cimport datasource +__all__ = ["Datasource"] cdef class Datasource: + __hash__ = None cdef datasource* get_datasource(self) except * nogil: with gil: raise NotImplementedError("get_datasource() should not " diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi new file mode 100644 index 00000000000..b2bc6a43700 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from collections.abc import Mapping +from typing import TypeAlias + +from pylibcudf.column import Column +from pylibcudf.io.types import ( + CompressionType, + JSONRecoveryMode, + SinkInfo, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] + +NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]] + +def read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + lines: bool = False, + byte_range_offset: int = 0, + byte_range_size: int = 0, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, +) -> TableWithMetadata: ... +def write_json( + sink_info: SinkInfo, + table_w_meta: TableWithMetadata, + na_rep: str = "", + include_nulls: bool = False, + lines: bool = False, + rows_per_chunk: int = 2**32 - 1, + true_value: str = "true", + false_value: str = "false", +) -> None: ... +def chunked_read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, + chunk_size: int = 100_000_000, +) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ... diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 65f78f830f1..ad2989925c9 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -23,6 +23,7 @@ from pylibcudf.libcudf.io.types cimport ( from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["chunked_read_json", "read_json", "write_json"] cdef map[string, schema_element] _generate_schema_map(list dtypes): cdef map[string, schema_element] schema_map diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi new file mode 100644 index 00000000000..4cf87f1a832 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Any + +from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.types import DataType + +def read_orc( + source_info: SourceInfo, + columns: list[str] | None = None, + stripes: list[list[int]] | None = None, + skip_rows: int = 0, + nrows: int = -1, + use_index: bool = True, + use_np_dtypes: bool = True, + timestamp_type: DataType | None = None, + decimal128_columns: list[str] | None = None, +) -> TableWithMetadata: ... 
+ +class OrcColumnStatistics: + def __init__(self): ... + @property + def number_of_values(self) -> int | None: ... + @property + def has_null(self) -> bool | None: ... + def __getitem__(self, item: str) -> Any: ... + def __contains__(self, item: str) -> bool: ... + def get[T](self, item: str, default: None | T = None) -> T | None: ... + +class ParsedOrcStatistics: + def __init__(self): ... + @property + def column_names(self) -> list[str]: ... + @property + def file_stats(self) -> list[OrcColumnStatistics]: ... + @property + def stripes_stats(self) -> list[OrcColumnStatistics]: ... + +def read_parsed_orc_statistics( + source_info: SourceInfo, +) -> ParsedOrcStatistics: ... diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 70e0a7995a2..4270f5b4f95 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -30,6 +30,12 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +__all__ = [ + "OrcColumnStatistics", + "ParsedOrcStatistics", + "read_orc", + "read_parsed_orc_statistics", +] cdef class OrcColumnStatistics: def __init__(self): @@ -39,6 +45,8 @@ cdef class OrcColumnStatistics: "use `OrcColumnStatistics.from_libcudf` instead." ) + __hash__ = None + @property def number_of_values(self): if self.number_of_values_c.has_value(): @@ -183,6 +191,8 @@ cdef class OrcColumnStatistics: cdef class ParsedOrcStatistics: + __hash__ = None + @property def column_names(self): return [name.decode() for name in self.c_obj.column_names] diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi new file mode 100644 index 00000000000..bcf1d1cce09 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.expressions import Expression +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +class ChunkedParquetReader: + def __init__( + self, + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + use_pandas_metadata: bool = True, + convert_strings_to_categories: bool = False, + skip_rows: int = 0, + nrows: int = 0, + chunk_read_limit: int = 0, + pass_read_limit: int = 1024000000, + allow_mismatched_pq_schemas: bool = False, + ) -> None: ... + def has_next(self) -> bool: ... + def read_chunk(self) -> TableWithMetadata: ... + +def read_parquet( + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + filters: Expression | None = None, + convert_strings_to_categories: bool = False, + use_pandas_metadata: bool = True, + skip_rows: int = 0, + nrows: int = -1, + allow_mismatched_pq_schemas: bool = False, + # disabled see comment in parquet.pyx for more + # reader_column_schema: ReaderColumnSchema = *, + # timestamp_type: DataType = * +) -> TableWithMetadata: ... 
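The chunked reader above is meant to bound memory while scanning large files. A usage sketch ("big.parquet" and process() are hypothetical; the limits are in bytes, and 0 means unlimited):

    import pylibcudf as plc

    reader = plc.io.parquet.ChunkedParquetReader(
        plc.io.SourceInfo(["big.parquet"]),
        chunk_read_limit=512_000_000,
    )
    while reader.has_next():
        chunk = reader.read_chunk()  # TableWithMetadata
        process(chunk.tbl)           # caller-defined work on the chunk's Table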
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 981ca7b8159..b76a352d633 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -16,6 +16,8 @@ from pylibcudf.libcudf.io.parquet cimport ( from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type +__all__ = ["ChunkedParquetReader", "read_parquet"] + cdef parquet_reader_options _setup_parquet_reader_options( SourceInfo source_info, @@ -123,6 +125,8 @@ cdef class ChunkedParquetReader: ) ) + __hash__ = None + cpdef bool has_next(self): """ Returns True if there is another chunk in the Parquet file diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd new file mode 100644 index 00000000000..e421a64adc8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io.parquet_metadata cimport( + parquet_metadata, + parquet_schema, + parquet_column_schema, +) + +cdef class ParquetColumnSchema: + cdef parquet_column_schema column_schema + + @staticmethod + cdef from_column_schema(parquet_column_schema column_schema) + + cpdef str name(self) + + cpdef int num_children(self) + + cpdef ParquetColumnSchema child(self, int idx) + + cpdef list children(self) + + +cdef class ParquetSchema: + cdef parquet_schema schema + + @staticmethod + cdef from_schema(parquet_schema schema) + + cpdef ParquetColumnSchema root(self) + + +cdef class ParquetMetadata: + cdef parquet_metadata meta + + @staticmethod + cdef from_metadata(parquet_metadata meta) + + cpdef ParquetSchema schema(self) + + cpdef int num_rows(self) + + cpdef int num_rowgroups(self) + + cpdef dict metadata(self) + + cpdef list rowgroup_metadata(self) + + +cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info) diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx new file mode 100644 index 00000000000..0ad4dafb0cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx @@ -0,0 +1,214 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata + + +__all__ = [ + "ParquetColumnSchema", + "ParquetMetadata", + "ParquetSchema", + "read_parquet_metadata", +] + +cdef class ParquetColumnSchema: + """ + Schema of a parquet column, including the nested columns. + + Parameters + ---------- + parquet_column_schema + """ + def __init__(self): + raise ValueError("Construct ParquetColumnSchema with from_column_schema.") + + @staticmethod + cdef from_column_schema(cpp_parquet_metadata.parquet_column_schema column_schema): + cdef ParquetColumnSchema result = ParquetColumnSchema.__new__( + ParquetColumnSchema + ) + result.column_schema = column_schema + return result + + cpdef str name(self): + """ + Returns parquet column name; can be empty. + + Returns + ------- + str + Column name + """ + return self.column_schema.name().decode() + + cpdef int num_children(self): + """ + Returns the number of child columns. + + Returns + ------- + int + Children count + """ + return self.column_schema.num_children() + + cpdef ParquetColumnSchema child(self, int idx): + """ + Returns schema of the child with the given index. 
+
+        Parameters
+        ----------
+        idx : int
+            Child index
+
+        Returns
+        -------
+        ParquetColumnSchema
+            Child schema
+        """
+        return ParquetColumnSchema.from_column_schema(self.column_schema.child(idx))
+
+    cpdef list children(self):
+        """
+        Returns schemas of all child columns.
+
+        Returns
+        -------
+        list[ParquetColumnSchema]
+            Child schemas.
+        """
+        cdef cpp_parquet_metadata.parquet_column_schema child
+        return [
+            ParquetColumnSchema.from_column_schema(child)
+            for child in self.column_schema.children()
+        ]
+
+
+cdef class ParquetSchema:
+    """
+    Schema of a parquet file.
+
+    Parameters
+    ----------
+    parquet_schema
+    """
+
+    def __init__(self):
+        raise ValueError("Construct ParquetSchema with from_schema.")
+
+    @staticmethod
+    cdef from_schema(cpp_parquet_metadata.parquet_schema schema):
+        cdef ParquetSchema result = ParquetSchema.__new__(ParquetSchema)
+        result.schema = schema
+        return result
+
+    cpdef ParquetColumnSchema root(self):
+        """
+        Returns the schema of the struct column that contains all columns as fields.
+
+        Returns
+        -------
+        ParquetColumnSchema
+            Root column schema
+        """
+        return ParquetColumnSchema.from_column_schema(self.schema.root())
+
+
+cdef class ParquetMetadata:
+    """
+    Information about the content of a parquet file.
+
+    Parameters
+    ----------
+    parquet_metadata
+    """
+
+    def __init__(self):
+        raise ValueError("Construct ParquetMetadata with from_metadata.")
+
+    @staticmethod
+    cdef from_metadata(cpp_parquet_metadata.parquet_metadata meta):
+        cdef ParquetMetadata result = ParquetMetadata.__new__(ParquetMetadata)
+        result.meta = meta
+        return result
+
+    cpdef ParquetSchema schema(self):
+        """
+        Returns the parquet schema.
+
+        Returns
+        -------
+        ParquetSchema
+            Parquet schema
+        """
+        return ParquetSchema.from_schema(self.meta.schema())
+
+    cpdef int num_rows(self):
+        """
+        Returns the number of rows of the root column.
+
+        Returns
+        -------
+        int
+            Number of rows
+        """
+        return self.meta.num_rows()
+
+    cpdef int num_rowgroups(self):
+        """
+        Returns the number of rowgroups in the file.
+
+        Returns
+        -------
+        int
+            Number of row groups.
+        """
+        return self.meta.num_rowgroups()
+
+    cpdef dict metadata(self):
+        """
+        Returns the key-value metadata in the file footer.
+
+        Returns
+        -------
+        dict[str, str]
+            Key-value metadata as a map.
+        """
+        return {key.decode(): val.decode() for key, val in self.meta.metadata()}
+
+    cpdef list rowgroup_metadata(self):
+        """
+        Returns the row group metadata in the file footer.
+
+        Returns
+        -------
+        list[dict[str, int]]
+            List of row group metadata maps.
+        """
+        return [
+            {key.decode(): val for key, val in metadata}
+            for metadata in self.meta.rowgroup_metadata()
+        ]
+
+
+cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info):
+    """
+    Reads metadata of a parquet dataset.
+
+    Parameters
+    ----------
+    src_info : SourceInfo
+        Dataset source.
+
+    Returns
+    -------
+    ParquetMetadata
+        ParquetMetadata with the parquet schema, number of rows,
+        number of row groups, and key-value metadata.
+    """
+    cdef cpp_parquet_metadata.parquet_metadata c_result
+
+    with nogil:
+        c_result = cpp_parquet_metadata.read_parquet_metadata(src_info.c_obj)
+
+    return ParquetMetadata.from_metadata(c_result)
diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
new file mode 100644
index 00000000000..051e9bc0cde
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source
+
+cdef class ParseOptions:
+    cdef parse_options c_options
+
+cdef class DataChunkSource:
+    cdef unique_ptr[data_chunk_source] c_source
+    cdef string data_ref
+
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=*
+)
+
+cpdef DataChunkSource make_source(str data)
+
+cpdef DataChunkSource make_source_from_file(str filename)
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=*,
+    int virtual_end=*,
+)
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
new file mode 100644
index 00000000000..d3cbdc4cd60
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -0,0 +1,202 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libc.stdint cimport uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.io cimport text as cpp_text
+
+__all__ = [
+    "DataChunkSource",
+    "ParseOptions",
+    "make_source",
+    "make_source_from_bgzip_file",
+    "make_source_from_file",
+    "multibyte_split",
+]
+
+cdef class ParseOptions:
+    """
+    Parsing options for `multibyte_split`
+
+    Parameters
+    ----------
+    byte_range : list | tuple, default None
+        Only rows starting inside this byte range will be
+        part of the output column.
+
+    strip_delimiters : bool, default False
+        Whether delimiters at the end of rows should
+        be stripped from the output column.
+    """
+    def __init__(
+        self,
+        *,
+        byte_range=None,
+        strip_delimiters=False,
+    ):
+        self.c_options = cpp_text.parse_options()
+        if byte_range is not None:
+            c_byte_range_offset = byte_range[0]
+            c_byte_range_size = byte_range[1]
+            self.c_options.byte_range = cpp_text.byte_range_info(
+                c_byte_range_offset,
+                c_byte_range_size
+            )
+        self.c_options.strip_delimiters = strip_delimiters
+
+
+cdef class DataChunkSource:
+    """
+    Data source for `multibyte_split`
+
+    Parameters
+    ----------
+    data : str
+        Filename or data itself.
+    """
+
+    def __cinit__(self, str data):
+        # Need to keep a reference alive for make_source
+        self.data_ref = data.encode()
+
+
+cpdef DataChunkSource make_source(str data):
+    """
+    Creates a data source capable of producing device-buffered views
+    of the given string.
+
+    Parameters
+    ----------
+    data : str
+        The host data to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided host data.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(data)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source(dcs.data_ref))
+    return dcs
+
+
+cpdef DataChunkSource make_source_from_file(str filename):
+    """
+    Creates a data source capable of producing device-buffered views of the file.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the file to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref))
+    return dcs
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=-1,
+    int virtual_end=-1,
+):
+    """
+    Creates a data source capable of producing device-buffered views of
+    a BGZIP compressed file with virtual record offsets.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the BGZIP-compressed file to be exposed as a data chunk source.
+
+    virtual_begin : int, default -1
+        The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits
+        describe the offset into the compressed file, its lower 16 bits describe the
+        block-local offset.
+
+    virtual_end : int, default -1
+        The virtual (Tabix) offset one past the last byte to be read.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef uint64_t c_virtual_begin
+    cdef uint64_t c_virtual_end
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+
+    if virtual_begin == -1 and virtual_end == -1:
+        with nogil:
+            dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref))
+    elif virtual_begin != -1 and virtual_end != -1:
+        c_virtual_begin = virtual_begin
+        c_virtual_end = virtual_end
+        with nogil:
+            dcs.c_source = move(
+                cpp_text.make_source_from_bgzip_file(
+                    dcs.data_ref,
+                    c_virtual_begin,
+                    c_virtual_end,
+                )
+            )
+    else:
+        raise ValueError(
+            "virtual_begin and virtual_end must both be left at -1 or both be set"
+        )
+    return dcs
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=None
+):
+    """
+    Splits the source text into a strings column using a multibyte delimiter.
+
+    For details, see :cpp:func:`cudf::io::text::multibyte_split`
+
+    Parameters
+    ----------
+    source : DataChunkSource
+        The source string.
+
+    delimiter : str
+        UTF-8 encoded string for which to find offsets in the source.
+
+    options : ParseOptions
+        The parsing options to use (including byte range).
+
+    Returns
+    -------
+    Column
+        The strings found by splitting the source by the delimiter
+        within the relevant byte range.
+    """
+    cdef unique_ptr[column] c_result
+    cdef unique_ptr[data_chunk_source] c_source = move(source.c_source)
+    cdef string c_delimiter = delimiter.encode()
+
+    if options is None:
+        options = ParseOptions()
+
+    cdef cpp_text.parse_options c_options = options.c_options
+
+    with nogil:
+        c_result = cpp_text.multibyte_split(
+            dereference(c_source),
+            c_delimiter,
+            c_options
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyi b/python/pylibcudf/pylibcudf/io/timezone.pyi
new file mode 100644
index 00000000000..0582800c4af
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.table import Table
+
+def make_timezone_transition_table(
+    tzif_dir: str, timezone_name: str
+) -> Table: ...
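Tying the text APIs above together, a minimal multibyte_split sketch (illustrative only, not part of the diff):

    import pylibcudf as plc

    source = plc.io.text.make_source("x::y::z")
    col = plc.io.text.multibyte_split(
        source,
        "::",
        plc.io.text.ParseOptions(strip_delimiters=True),
    )
    # -> strings column ["x", "y", "z"]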
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index f120b65fb2c..af7cf8a4ee5 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from ..table cimport Table +__all__ = ["make_timezone_transition_table"] cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): """ diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi new file mode 100644 index 00000000000..a4f4fc13bdc --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import io +import os +from collections.abc import Mapping +from enum import IntEnum +from typing import Any, Literal, TypeAlias, overload + +from pylibcudf.column import Column +from pylibcudf.io.datasource import Datasource +from pylibcudf.table import Table + +class JSONRecoveryMode(IntEnum): + FAIL = ... + RECOVER_WITH_NULL = ... + +class CompressionType(IntEnum): + NONE = ... + AUTO = ... + SNAPPY = ... + GZIP = ... + BZIP2 = ... + BROTLI = ... + ZIP = ... + XZ = ... + ZLIB = ... + LZ4 = ... + LZO = ... + ZSTD = ... + +class ColumnEncoding(IntEnum): + USE_DEFAULT = ... + DICTIONARY = ... + PLAIN = ... + DELTA_BINARY_PACKED = ... + DELTA_LENGTH_BYTE_ARRAY = ... + DELTA_BYTE_ARRAY = ... + BYTE_STREAM_SPLIT = ... + DIRECT = ... + DIRECT_V2 = ... + DICTIONARY_V2 = ... + +class DictionaryPolicy(IntEnum): + NEVER = ... + ADAPTIVE = ... + ALWAYS = ... + +class StatisticsFreq(IntEnum): + STATISTICS_NONE = ... + STATISTICS_ROWGROUP = ... + STATISTICS_PAGE = ... + STATISTICS_COLUMN = ... + +class QuoteStyle(IntEnum): + MINIMAL = ... + ALL = ... + NONNUMERIC = ... + NONE = ... + +ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]] +ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec] + +class TableWithMetadata: + tbl: Table + def __init__( + self, tbl: Table, column_names: list[ColumnNameSpec] + ) -> None: ... + @property + def columns(self) -> list[Column]: ... + @overload + def column_names(self, include_children: Literal[False]) -> list[str]: ... + @overload + def column_names( + self, include_children: Literal[True] + ) -> list[ColumnNameSpec]: ... + @overload + def column_names( + self, include_children: bool = False + ) -> list[str] | list[ColumnNameSpec]: ... + @property + def child_names(self) -> ChildNameSpec: ... + @property + def per_file_user_data(self) -> list[Mapping[str, str]]: ... + +class SourceInfo: + def __init__( + self, sources: list[str] | list[os.PathLike[Any]] | list[Datasource] + ) -> None: ... + +class SinkInfo: + def __init__( + self, + sinks: list[os.PathLike[Any]] + | list[io.StringIO] + | list[io.BytesIO] + | list[io.TextIOBase] + | list[str], + ) -> None: ... 
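A short sketch of the source/sink containers stubbed above (paths are hypothetical). Sources must be homogeneous — mixing, say, paths with Datasource objects raises ValueError — while sinks may be in-memory buffers:

    import io
    import pylibcudf as plc

    src = plc.io.SourceInfo(["part-0.json", "part-1.json"])  # all paths
    sink = plc.io.SinkInfo([io.BytesIO()])                   # in-memory sink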
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 967d05e7057..5db4eeb9583 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -20,6 +20,7 @@ import codecs
 import errno
 import io
 import os
+import re
 
 from pylibcudf.libcudf.io.json import \
     json_recovery_mode_t as JSONRecoveryMode  # no-cython-lint
@@ -27,9 +28,21 @@ from pylibcudf.libcudf.io.types import (
     compression_type as CompressionType,  # no-cython-lint
     column_encoding as ColumnEncoding,  # no-cython-lint
     dictionary_policy as DictionaryPolicy,  # no-cython-lint
+    quote_style as QuoteStyle,  # no-cython-lint
     statistics_freq as StatisticsFreq,  # no-cython-lint
 )
 
+__all__ = [
+    "ColumnEncoding",
+    "CompressionType",
+    "DictionaryPolicy",
+    "JSONRecoveryMode",
+    "QuoteStyle",
+    "SinkInfo",
+    "SourceInfo",
+    "StatisticsFreq",
+    "TableWithMetadata",
+]
 
 cdef class TableWithMetadata:
     """A container holding a table and its associated metadata
@@ -53,6 +66,8 @@ cdef class TableWithMetadata:
 
         self.metadata.schema_info = self._make_column_info(column_names)
 
+    __hash__ = None
+
     cdef vector[column_name_info] _make_column_info(self, list column_names):
         cdef vector[column_name_info] col_name_infos
         cdef column_name_info info
@@ -147,6 +162,8 @@ cdef class SourceInfo:
 
     Mixing different types of sources will raise a `ValueError`.
     """
+    # Regular expression that matches remote file paths supported by libcudf
+    _is_remote_file_pattern = re.compile(r"^s3://", re.IGNORECASE)
 
     def __init__(self, list sources):
         if not sources:
@@ -161,11 +178,10 @@ cdef class SourceInfo:
         for src in sources:
             if not isinstance(src, (os.PathLike, str)):
                 raise ValueError("All sources must be of the same type!")
-            if not os.path.isfile(src):
-                raise FileNotFoundError(errno.ENOENT,
-                                        os.strerror(errno.ENOENT),
-                                        src)
-
+            if not (os.path.isfile(src) or self._is_remote_file_pattern.match(src)):
+                raise FileNotFoundError(
+                    errno.ENOENT, os.strerror(errno.ENOENT), src
+                )
             c_files.push_back(str(src).encode())
 
         self.c_obj = move(source_info(c_files))
@@ -217,6 +233,8 @@ cdef class SourceInfo:
 
         self.c_obj = source_info(c_host_buffers)
 
+    __hash__ = None
+
 
 # Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you
 # write from cudf to any python file-like object (File/BytesIO/SocketIO etc)
@@ -299,3 +317,5 @@ cdef class SinkInfo:
         else:
             # we don't have sinks so we must have paths to sinks
             self.c_obj = sink_info(paths)
+
+    __hash__ = None
diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi
new file mode 100644
index 00000000000..f34357baa67
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/join.pyi
@@ -0,0 +1,78 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.expressions import Expression
+from pylibcudf.table import Table
+from pylibcudf.types import NullEquality
+
+def inner_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def left_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def full_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def left_semi_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> Column: ...
+def left_anti_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> Column: ...
+def cross_join(left: Table, right: Table) -> Table: ...
+def conditional_inner_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_full_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_semi_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def conditional_left_anti_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def mixed_inner_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_full_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_semi_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... +def mixed_left_anti_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 0d841eee194..c2efe05ffc4 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -15,6 +15,24 @@ from .column cimport Column from .expressions cimport Expression from .table cimport Table +__all__ = [ + "conditional_full_join", + "conditional_inner_join", + "conditional_left_anti_join", + "conditional_left_join", + "conditional_left_semi_join", + "cross_join", + "full_join", + "inner_join", + "left_anti_join", + "left_join", + "left_semi_join", + "mixed_full_join", + "mixed_inner_join", + "mixed_left_anti_join", + "mixed_left_join", + "mixed_left_semi_join", +] cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi new file mode 100644 index 00000000000..b93d4876dab --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class GetJsonObjectOptions: + def __init__( + self, + *, + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> None: ... + def get_allow_single_quotes(self) -> bool: ... + def get_strip_quotes_from_single_strings(self) -> bool: ... + def get_missing_fields_as_nulls(self) -> bool: ... + def set_allow_single_quotes(self, val: bool) -> None: ... + def set_strip_quotes_from_single_strings(self, val: bool) -> None: ... + def set_missing_fields_as_nulls(self, val: bool) -> None: ... + +def get_json_object( + col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None +) -> Column: ... 
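A get_json_object sketch against the stub above (assumes pyarrow for building the input column and the JSONPath scalar; illustrative only):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(['{"a": 1}', '{"a": 2}']))
    path = plc.interop.from_arrow(pa.scalar("$.a"))  # JSONPath as a string scalar
    opts = plc.json.GetJsonObjectOptions(missing_fields_as_nulls=True)
    vals = plc.json.get_json_object(col, path, opts)  # -> strings ["1", "2"]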
diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index ebb82f80408..5ec1e1be971 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.scalar cimport Scalar +__all__ = ["GetJsonObjectOptions", "get_json_object"] cdef class GetJsonObjectOptions: """Settings for ``get_json_object()``""" @@ -26,6 +27,8 @@ cdef class GetJsonObjectOptions: ) self.set_missing_fields_as_nulls(missing_fields_as_nulls) + __hash__ = None + def get_allow_single_quotes(self): """ Returns true/false depending on whether single-quotes for representing strings diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd index 6f8797ae7d3..b1f9f2e806d 100644 --- a/python/pylibcudf/pylibcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -8,7 +8,7 @@ from .column cimport Column cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ) diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi new file mode 100644 index 00000000000..c3a75d10baf --- /dev/null +++ b/python/pylibcudf/pylibcudf/labeling.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class Inclusive(IntEnum): + YES = ... + NO = ... + +def label_bins( + input: Column, + left_edges: Column, + left_inclusive: Inclusive, + right_edges: Column, + right_inclusive: Inclusive, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index 226a9e14172..cae1830f6b9 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -10,13 +10,14 @@ from pylibcudf.libcudf.labeling import inclusive as Inclusive # no-cython-lint from .column cimport Column +__all__ = ["Inclusive", "label_bins"] cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ): """Labels elements based on membership in the specified bins. @@ -28,11 +29,11 @@ cpdef Column label_bins( Column of input elements to label according to the specified bins. left_edges : Column Column of the left edge of each bin. - left_inclusive : bool + left_inclusive : Inclusive Whether or not the left edge is inclusive. right_edges : Column Column of the right edge of each bin. - right_inclusive : bool + right_inclusive : Inclusive Whether or not the right edge is inclusive. Returns @@ -42,24 +43,13 @@ cpdef Column label_bins( according to the specified bins. 
""" cdef unique_ptr[column] c_result - cdef inclusive c_left_inclusive = ( - inclusive.YES - if left_inclusive - else inclusive.NO - ) - cdef inclusive c_right_inclusive = ( - inclusive.YES - if right_inclusive - else inclusive.NO - ) - with nogil: c_result = cpp_labeling.label_bins( input.view(), left_edges.view(), - c_left_inclusive, + left_inclusive, right_edges.view(), - c_right_inclusive, + right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 15beaee47d4..00669ff579a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -24,4 +24,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) add_subdirectory(io) +add_subdirectory(lists) add_subdirectory(strings) diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd index 8e6da56c9a6..b0ce13e4492 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int64_t from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.io.types cimport source_info cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: @@ -28,4 +28,4 @@ cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: unordered_map[string, string] metadata() except+ vector[unordered_map[string, int64_t]] rowgroup_metadata() except+ - cdef parquet_metadata read_parquet_metadata(cudf_io_types.source_info src) except+ + cdef parquet_metadata read_parquet_metadata(source_info src_info) except+ diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt new file mode 100644 index 00000000000..c896db2c85a --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources combine.pyx contains.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_lists +) diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd index d077958ce03..09a5d84c64f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,10 +10,9 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: - ctypedef enum concatenate_null_policy: - IGNORE "cudf::lists::concatenate_null_policy::IGNORE" - NULLIFY_OUTPUT_ROW \ - "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW" + cpdef enum class concatenate_null_policy(int32_t): + IGNORE + NULLIFY_OUTPUT_ROW cdef unique_ptr[column] concatenate_rows( const table_view input_table diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 41250037dcf..ebf8eda1ce3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -22,6 +22,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash_permuted( + const column_view &strings, + const uint32_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] minhash64( const column_view &strings, const column_view &seeds, @@ -34,6 +42,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64_permuted( + const column_view &strings, + const uint64_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index e7d006e6e2e..10c1c26e24e 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -1,7 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) +from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy +from pylibcudf.libcudf.lists.contains cimport duplicate_find_option from .column cimport Column from .scalar cimport Scalar @@ -19,13 +23,13 @@ cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) -cpdef Column concatenate_list_elements(Column, bool dropna) +cpdef Column concatenate_list_elements(Column, concatenate_null_policy null_policy) cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) -cpdef Column index_of(Column, ColumnOrScalar, bool) +cpdef Column index_of(Column, ColumnOrScalar, duplicate_find_option) cpdef Column reverse(Column) @@ -37,16 +41,24 @@ cpdef Column count_elements(Column) cpdef Column sequences(Column, Column, Column steps = *) -cpdef Column sort_lists(Column, bool, null_order, bool stable = *) +cpdef Column sort_lists(Column, order, null_order, bool stable = *) -cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column difference_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column have_overlap( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column intersect_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column union_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) cpdef Column apply_boolean_mask(Column, Column) -cpdef Column distinct(Column, bool, bool) +cpdef Column distinct(Column, null_equality, nan_equality) diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi new file mode 100644 index 00000000000..dff6c400638 --- /dev/null +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NullEquality, NullOrder, Order + +class ConcatenateNullPolicy(IntEnum): + IGNORE = ... + NULLIFY_OUTPUT_ROW = ... + +class DuplicateFindOption(IntEnum): + FIND_FIRST = ... + FIND_LAST = ... + +def explode_outer(input: Table, explode_column_idx: int) -> Table: ... +def concatenate_rows(input: Table) -> Column: ... +def concatenate_list_elements( + input: Column, null_policy: ConcatenateNullPolicy +) -> Column: ... +def contains(input: Column, search_key: Column | Scalar) -> Column: ... +def contains_nulls(input: Column) -> Column: ... +def index_of( + input: Column, + search_key: Column | Scalar, + find_option: DuplicateFindOption, +) -> Column: ... +def reverse(input: Column) -> Column: ... +def segmented_gather(input: Column, gather_map_list: Column) -> Column: ... +def extract_list_element(input: Column, index: Column | int) -> Column: ... +def count_elements(input: Column) -> Column: ... +def sequences( + starts: Column, sizes: Column, steps: Column | None = None +) -> Column: ... 
+def sort_lists( + input: Column, + sort_order: Order, + na_position: NullOrder, + stable: bool = False, +) -> Column: ... +def difference_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def have_overlap( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def intersect_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def union_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def apply_boolean_mask(input: Column, boolean_mask: Column) -> Column: ... +def distinct( + input: Column, nulls_equal: NullEquality, nans_equal: NanEquality +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index ecaf62d6895..ccc56eaa520 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -42,10 +42,35 @@ from pylibcudf.libcudf.types cimport ( ) from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType +from pylibcudf.libcudf.lists.combine import concatenate_null_policy as ConcatenateNullPolicy # no-cython-lint +from pylibcudf.libcudf.lists.contains import duplicate_find_option as DuplicateFindOption # no-cython-lint + from .column cimport Column, ListColumnView from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "ConcatenateNullPolicy", + "DuplicateFindOption", + "apply_boolean_mask", + "concatenate_list_elements", + "concatenate_rows", + "contains", + "contains_nulls", + "count_elements", + "difference_distinct", + "distinct", + "explode_outer", + "extract_list_element", + "have_overlap", + "index_of", + "intersect_distinct", + "reverse", + "segmented_gather", + "sequences", + "sort_lists", + "union_distinct", +] cpdef Table explode_outer(Table input, size_type explode_column_idx): """Explode a column of lists into rows. @@ -97,7 +122,9 @@ cpdef Column concatenate_rows(Table input): return Column.from_libcudf(move(c_result)) -cpdef Column concatenate_list_elements(Column input, bool dropna): +cpdef Column concatenate_list_elements( + Column input, concatenate_null_policy null_policy +): """Concatenate multiple lists on the same row into a single list. For details, see :cpp:func:`concatenate_list_elements`. @@ -106,20 +133,14 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. + null_policy : ConcatenateNullPolicy + How to treat null list elements. 
Returns
    -------
    Column
        A new Column of concatenated list elements
    """
-    cdef concatenate_null_policy null_policy = (
-        concatenate_null_policy.IGNORE if dropna
-        else concatenate_null_policy.NULLIFY_OUTPUT_ROW
-    )
     cdef unique_ptr[column] c_result

     with nogil:
@@ -191,7 +212,9 @@ cpdef Column contains_nulls(Column input):
     return Column.from_libcudf(move(c_result))


-cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option):
+cpdef Column index_of(
+    Column input, ColumnOrScalar search_key, duplicate_find_option find_option
+):
     """Create a column of index values indicating the position of a search
     key row within the corresponding list row in the lists column.

@@ -207,9 +230,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
         The input column.
     search_key : Union[Column, Scalar]
         The search key.
-    find_first_option : bool
-        If true, index_of returns the first match.
-        Otherwise the last match is returned.
+    find_option : DuplicateFindOption
+        Which match to return if there are duplicates.

     Returns
     -------
@@ -220,11 +242,6 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
     """
     cdef unique_ptr[column] c_result
     cdef ListColumnView list_view = input.list_view()
-    cdef cpp_contains.duplicate_find_option find_option = (
-        cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option
-        else cpp_contains.duplicate_find_option.FIND_LAST
-    )
-
     with nogil:
         c_result = cpp_contains.index_of(
             list_view.view(),
@@ -380,7 +397,7 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None):

 cpdef Column sort_lists(
     Column input,
-    bool ascending,
+    order sort_order,
     null_order na_position,
     bool stable = False
 ):
@@ -392,8 +409,8 @@ cpdef Column sort_lists(
     ----------
     input : Column
         The input column.
-    ascending : bool
-        If true, the sort order is ascending. Otherwise, the sort order is descending.
+    sort_order : Order
+        Sort order in the list.
     na_position : NullOrder
        If na_position equals NullOrder.FIRST, then the null values in the
        output column are placed first. Otherwise, they are placed after.
@@ -409,21 +426,17 @@ cpdef Column sort_lists(
     cdef unique_ptr[column] c_result
     cdef ListColumnView list_view = input.list_view()

-    cdef order c_sort_order = (
-        order.ASCENDING if ascending else order.DESCENDING
-    )
-
     with nogil:
         if stable:
             c_result = cpp_stable_sort_lists(
                 list_view.view(),
-                c_sort_order,
+                sort_order,
                 na_position,
             )
         else:
             c_result = cpp_sort_lists(
                 list_view.view(),
-                c_sort_order,
+                sort_order,
                 na_position,
             )
     return Column.from_libcudf(move(c_result))
@@ -432,8 +445,8 @@ cpdef Column sort_lists(
 cpdef Column difference_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of the distinct elements of each list in ``lhs``
     that are not found in the corresponding list in ``rhs``.

     For details, see :cpp:func:`difference_distinct`.

     Parameters
     ----------
     lhs : Column
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.

     Returns
     -------
@@ -461,19 +473,12 @@ cpdef Column difference_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.difference_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -481,8 +486,8 @@ cpdef Column have_overlap(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Check if lists at each row of the given lists columns overlap.

@@ -494,11 +499,10 @@ cpdef Column have_overlap(
         The input lists column for one side.
     rhs : Column
         The input lists column for the other side.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.

     Returns
     -------
@@ -509,19 +513,12 @@ cpdef Column have_overlap(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.have_overlap(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -529,8 +526,8 @@ cpdef Column intersect_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of distinct elements common to two input lists columns.

@@ -542,11 +539,10 @@ cpdef Column intersect_distinct(
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.
Returns
     -------
@@ -557,19 +553,12 @@ cpdef Column intersect_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.intersect_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -577,8 +566,8 @@ cpdef Column union_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of distinct elements found in
     either of two input lists columns.

@@ -591,11 +580,10 @@ cpdef Column union_distinct(
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.

     Returns
     -------
@@ -606,19 +594,12 @@ cpdef Column union_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.union_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -651,7 +632,7 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask):
     return Column.from_libcudf(move(c_result))


-cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
+cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans_equal):
     """Create a new list column without duplicate elements in each list.

     For details, see :cpp:func:`distinct`.

@@ -660,11 +641,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
     ----------
     input : Column
         The input column.
-    nulls_equal : bool
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality
+        Whether nulls are considered equal.
+    nans_equal : NanEquality
+        Whether nans are considered equal.
Returns ------- @@ -674,17 +654,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_distinct( list_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi new file mode 100644 index 00000000000..b18eb01f8a2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/merge.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def merge( + tables_to_merge: list[Table], + key_cols: list[int], + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 61a21aafdb2..c051cdc0c66 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order, size_type from .table cimport Table +__all__ = ["merge"] cpdef Table merge ( list tables_to_merge, diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi new file mode 100644 index 00000000000..1a6d96a0822 --- /dev/null +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from rmm.pylibrmm.device_buffer import DeviceBuffer + +from pylibcudf.column import Column +from pylibcudf.types import MaskState + +def copy_bitmask(col: Column) -> DeviceBuffer: ... +def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ... +def create_null_mask( + size: int, state: MaskState = MaskState.UNINITIALIZED +) -> DeviceBuffer: ... +def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... +def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 74180951562..adc264e9af6 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -14,6 +14,13 @@ from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint from .column cimport Column from .table cimport Table +__all__ = [ + "bitmask_allocation_size_bytes", + "bitmask_and", + "bitmask_or", + "copy_bitmask", + "create_null_mask", +] cdef DeviceBuffer buffer_to_python(device_buffer buf): return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi new file mode 100644 index 00000000000..ca39aa16d7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class BPEMergePairs: + def __init__(self, merge_pairs: Column): ... + +def byte_pair_encoding( + input: Column, merge_pairs: BPEMergePairs, separator: Scalar | None = None +) -> Column: ... 
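A sketch of how the stubbed BPE entry points fit together. The two merge pairs are made-up placeholders (real tables come from a trained BPE merges file), and `plc.interop.from_arrow` is assumed for column construction.

```python
import pyarrow as pa
import pylibcudf as plc

# Hypothetical merge-pair table; each row is a space-separated pair to merge.
pairs = plc.nvtext.byte_pair_encode.BPEMergePairs(
    plc.interop.from_arrow(pa.array(["e s", "e n"]))
)
strings = plc.interop.from_arrow(pa.array(["test sentence"]))
encoded = plc.nvtext.byte_pair_encode.byte_pair_encoding(strings, pairs)
```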
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx index 76caad276d4..7565b21084f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.scalar cimport Scalar +__all__ = ["BPEMergePairs", "byte_pair_encoding"] cdef class BPEMergePairs: """The table of merge pairs for the BPE encoder. @@ -27,6 +28,8 @@ cdef class BPEMergePairs: with nogil: self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + __hash__ = None + cpdef Column byte_pair_encoding( Column input, BPEMergePairs merge_pairs, diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi new file mode 100644 index 00000000000..85bbbb880ee --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def edit_distance(input: Column, targets: Column) -> Column: ... +def edit_distance_matrix(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index dcacb2e1267..eceeaff24e3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.nvtext.edit_distance cimport ( edit_distance_matrix as cpp_edit_distance_matrix, ) +__all__ = ["edit_distance", "edit_distance_matrix"] cpdef Column edit_distance(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi new file mode 100644 index 00000000000..2757518379d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def generate_ngrams( + input: Column, ngrams: int, separator: Scalar +) -> Column: ... +def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... +def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 09859d09e9e..521bc0ef4a4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -14,6 +14,11 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = [ + "generate_ngrams", + "generate_character_ngrams", + "hash_character_ngrams", +] cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi new file mode 100644 index 00000000000..18263c5c8fd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def jaccard_index(input1: Column, input2: Column, width: int) -> Column: ... 
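A small sketch of the `jaccard_index` stub above, under the same `interop.from_arrow` assumption; the strings are illustrative.

```python
import pyarrow as pa
import pylibcudf as plc

s1 = plc.interop.from_arrow(pa.array(["the fuzzy dog", "little piggy"]))
s2 = plc.interop.from_arrow(pa.array(["the fuzzy cat", "bigger piggy"]))

# Jaccard similarity computed over 5-character substrings of each row pair.
similarity = plc.nvtext.jaccard.jaccard_index(s1, s2, 5)
```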
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
index 3d8669865d9..90cace088f7 100644
--- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.jaccard cimport (
 )
 from pylibcudf.libcudf.types cimport size_type

+__all__ = ["jaccard_index"]

 cpdef Column jaccard_index(Column input1, Column input2, size_type width):
     """
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
index 97e8c9dc83c..6b544282f44 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -11,8 +11,24 @@ ctypedef fused ColumnOrScalar:

 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)

+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)

+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column word_minhash(Column input, Column seeds)

 cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
new file mode 100644
index 00000000000..a2d9b6364f7
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def minhash(
+    input: Column, seeds: Column | Scalar, width: int = 4
+) -> Column: ...
+def minhash_permuted(
+    input: Column, seed: int, a: Column, b: Column, width: int
+) -> Column: ...
+def minhash64(
+    input: Column, seeds: Column | Scalar, width: int = 4
+) -> Column: ...
+def minhash64_permuted(
+    input: Column, seed: int, a: Column, b: Column, width: int
+) -> Column: ...
+def word_minhash(input: Column, seeds: Column) -> Column: ...
+def word_minhash64(input: Column, seeds: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index f1e012e60e5..5448cc6de9b 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    minhash64_permuted as cpp_minhash64_permuted,
+    minhash_permuted as cpp_minhash_permuted,
     word_minhash as cpp_word_minhash,
     word_minhash64 as cpp_word_minhash64,
 )
@@ -16,7 +18,16 @@ from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar

 from cython.operator import dereference
+import warnings

+__all__ = [
+    "minhash",
+    "minhash64",
+    "minhash64_permuted",
+    "minhash_permuted",
+    "word_minhash",
+    "word_minhash64",
+]

 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     """
@@ -40,6 +49,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result

     if not isinstance(seeds, (Column, Scalar)):
@@ -55,6 +70,50 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     return Column.from_libcudf(move(c_result))


+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint32_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width used for substrings.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     """
     Returns the minhash values for each string per seed.
@@ -77,6 +136,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash64_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result

     if not isinstance(seeds, (Column, Scalar)):
@@ -92,6 +157,50 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     return Column.from_libcudf(move(c_result))


+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash64_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint64_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width used for substrings.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash64_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column word_minhash(Column input, Column seeds):
     """
     Returns the minhash values for each row of strings per seed.
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi
new file mode 100644
index 00000000000..224640ed44d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def ngrams_tokenize(
+    input: Column, ngrams: int, delimiter: Scalar, separator: Scalar
+) -> Column: ...
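Since the deprecation warnings in minhash.pyx above point users at the new `minhash_permuted` signature, a usage sketch may help. The seed and the `a`/`b` parameter columns (uint32 for the 32-bit variant) are illustrative, and `plc.interop.from_arrow` is assumed for construction.

```python
import pyarrow as pa
import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["minhash example string"]))
# Hypothetical permutation parameters; one minhash value is produced per (a, b) pair.
a = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.uint32()))
b = plc.interop.from_arrow(pa.array([4, 5, 6], type=pa.uint32()))

# One list of three minhash values per input row, seed 42, 5-character substrings.
hashes = plc.nvtext.minhash.minhash_permuted(strings, 42, a, b, 5)
```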
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx index 8a1854c5f0d..771c7c019fc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["ngrams_tokenize"] cpdef Column ngrams_tokenize( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi new file mode 100644 index 00000000000..1d90a5a8960 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def normalize_spaces(input: Column) -> Column: ... +def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index 637d900b659..b259ccaefa6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.normalize cimport ( normalize_spaces as cpp_normalize_spaces, ) +__all__ = ["normalize_characters", "normalize_spaces"] cpdef Column normalize_spaces(Column input): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyi b/python/pylibcudf/pylibcudf/nvtext/replace.pyi new file mode 100644 index 00000000000..1f1ac72ce7c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace_tokens( + input: Column, + targets: Column, + replacements: Column, + delimiter: Scalar | None = None, +) -> Column: ... +def filter_tokens( + input: Column, + min_token_length: int, + replacement: Scalar | None = None, + delimiter: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx index b65348ce14d..a27592fb434 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["filter_tokens", "replace_tokens"] cpdef Column replace_tokens( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi new file mode 100644 index 00000000000..d6ba1d189bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def is_letter( + input: Column, check_vowels: bool, indices: Column | int +) -> Column: ... +def porter_stemmer_measure(input: Column) -> Column: ... 
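A short sketch of the `replace_tokens` stub above, under the usual `interop.from_arrow` assumption; with no delimiter scalar supplied, whitespace delimits tokens.

```python
import pyarrow as pa
import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["hello world", "goodbye world"]))
targets = plc.interop.from_arrow(pa.array(["world"]))
replacements = plc.interop.from_arrow(pa.array(["cudf"]))

# Replace every whitespace-delimited token matching a target with its replacement.
out = plc.nvtext.replace.replace_tokens(strings, targets, replacements)
```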
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx index 854d1053624..c9e4f1274e4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.nvtext.stemmer cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["is_letter", "porter_stemmer_measure"] cpdef Column is_letter( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi new file mode 100644 index 00000000000..f6618e296b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class HashedVocabulary: + def __init__(self, hash_file: str): ... + +def subword_tokenize( + input: Column, + vocabulary_table: HashedVocabulary, + max_sequence_length: int, + stride: int, + do_lower_case: bool, + do_truncate: bool, +) -> tuple[Column, Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx index 04643d3bd84..14fb6f5fe1e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( tokenizer_result as cpp_tokenizer_result, ) +__all__ = ["HashedVocabulary", "subword_tokenize"] cdef class HashedVocabulary: """The vocabulary data for use with the subword_tokenize function. @@ -24,6 +25,8 @@ cdef class HashedVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) + __hash__ = None + cpdef tuple[Column, Column, Column] subword_tokenize( Column input, HashedVocabulary vocabulary_table, diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi new file mode 100644 index 00000000000..b9aa2393514 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class TokenizeVocabulary: + def __init__(self, vocab: Column): ... + +def tokenize_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def tokenize_column(input: Column, delimiters: Column) -> Column: ... +def count_tokens_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def count_tokens_column(input: Column, delimiters: Column) -> Column: ... +def character_tokenize(input: Column) -> Column: ... +def detokenize( + input: Column, row_indices: Column, separator: Scalar | None = None +) -> Column: ... +def tokenize_with_vocabulary( + input: Column, + vocabulary: TokenizeVocabulary, + delimiter: Scalar, + default_id: int = -1, +) -> Column: ... 
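A sketch of the vocabulary-based tokenizer stubbed above; the two-word vocabulary is illustrative, and `plc.interop.from_arrow` is assumed for building the columns and the delimiter scalar.

```python
import pyarrow as pa
import pylibcudf as plc

vocab = plc.nvtext.tokenize.TokenizeVocabulary(
    plc.interop.from_arrow(pa.array(["hello", "world"]))
)
strings = plc.interop.from_arrow(pa.array(["hello there world"]))
delimiter = plc.interop.from_arrow(pa.scalar(" "))

# Tokens not found in the vocabulary map to default_id (-1 here).
ids = plc.nvtext.tokenize.tokenize_with_vocabulary(
    strings, vocab, delimiter, default_id=-1
)
```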
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx index ec02e8ebf4e..43d426489b4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -20,6 +20,16 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = [ + "TokenizeVocabulary", + "character_tokenize", + "count_tokens_column", + "count_tokens_scalar", + "detokenize", + "tokenize_column", + "tokenize_scalar", + "tokenize_with_vocabulary", +] cdef class TokenizeVocabulary: """The Vocabulary object to be used with ``tokenize_with_vocabulary``. @@ -31,6 +41,8 @@ cdef class TokenizeVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary(c_vocab)) + __hash__ = None + cpdef Column tokenize_scalar(Column input, Scalar delimiter=None): """ Returns a single column of strings by tokenizing the input diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi new file mode 100644 index 00000000000..48a2ade23f1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def hash_partition( + input: Table, columns_to_hash: list[int], num_partitions: int +) -> tuple[Table, list[int]]: ... +def partition( + t: Table, partition_map: Column, num_partitions: int +) -> tuple[Table, list[int]]: ... +def round_robin_partition( + input: Table, num_partitions: int, start_partition: int = 0 +) -> tuple[Table, list[int]]: ... diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 3cff4843735..1dacabceb06 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -11,6 +11,11 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "hash_partition", + "partition", + "round_robin_partition", +] cpdef tuple[Table, list] hash_partition( Table input, diff --git a/python/pylibcudf/pylibcudf/py.typed b/python/pylibcudf/pylibcudf/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi new file mode 100644 index 00000000000..dca6eed013a --- /dev/null +++ b/python/pylibcudf/pylibcudf/quantiles.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Sequence + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import Interpolation, NullOrder, Order, Sorted + +def quantile( + input: Column, + q: Sequence[float], + interp: Interpolation = Interpolation.LINEAR, + ordered_indices: Column | None = None, + exact: bool = True, +) -> Column: ... +def quantiles( + input: Table, + q: Sequence[float], + interp: Interpolation = Interpolation.NEAREST, + is_input_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, +) -> Table: ... 
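A minimal sketch of the `quantile` stub above. It operates on presorted data unless `ordered_indices` is supplied, so the input here is already sorted; values are illustrative and `interop.from_arrow` is assumed.

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.types import Interpolation

col = plc.interop.from_arrow(pa.array([1.0, 2.0, 3.0, 4.0]))

# Median and 90th percentile with linear interpolation between rows.
q = plc.quantiles.quantile(col, [0.5, 0.9], Interpolation.LINEAR)
```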
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 7d92b598bd0..634218586ac 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -17,6 +17,7 @@ from .column cimport Column from .table cimport Table from .types cimport interpolation +__all__ = ["quantile", "quantiles"] cpdef Column quantile( Column input, diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi new file mode 100644 index 00000000000..a09949b7b30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reduce.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class ScanType(IntEnum): + INCLUSIVE = ... + EXCLUSIVE = ... + +def reduce(col: Column, agg: Aggregation, data_type: DataType) -> Scalar: ... +def scan(col: Column, agg: Aggregation, inclusive: ScanType) -> Column: ... +def minmax(col: Column) -> tuple[Scalar, Scalar]: ... diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index d9ec3a9bdc4..1d6ffd9de10 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -16,6 +16,7 @@ from .types cimport DataType from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint +__all__ = ["ScanType", "minmax", "reduce", "scan"] cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): """Perform a reduction on a column diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi new file mode 100644 index 00000000000..eed7a2a6c52 --- /dev/null +++ b/python/pylibcudf/pylibcudf/replace.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class ReplacePolicy(IntEnum): + PRECEDING = ... + FOLLOWING = ... + +def replace_nulls( + source_column: Column, replacement: Column | Scalar | ReplacePolicy +) -> Column: ... +def find_and_replace_all( + source_column: Column, + values_to_replace: Column, + replacement_values: Column, +) -> Column: ... +def clamp( + source_column: Column, + lo: Scalar, + hi: Scalar, + lo_replace: Scalar | None = None, + hi_replace: Scalar | None = None, +) -> Column: ... +def normalize_nans_and_zeros( + source_column: Column, inplace: bool = False +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index f77eba7ace5..51be2b29277 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -15,6 +15,14 @@ from pylibcudf.libcudf.replace import \ from .column cimport Column from .scalar cimport Scalar +__all__ = [ + "ReplacePolicy", + "clamp", + "find_and_replace_all", + "normalize_nans_and_zeros", + "replace_nulls", +] + cpdef Column replace_nulls(Column source_column, ReplacementType replacement): """Replace nulls in source_column. diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi new file mode 100644 index 00000000000..d8d0ffcc3e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reshape.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def interleave_columns(source_table: Table) -> Column: ... 
+def tile(source_table: Table, count: int) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index 6540b5198ab..bdc212a1985 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .table cimport Table +__all__ = ["interleave_columns", "tile"] cpdef Column interleave_columns(Table source_table): """Interleave columns of a table into a single column. diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi new file mode 100644 index 00000000000..ca0111e01ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/rolling.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column + +def rolling_window[WindowType: (Column, int)]( + source: Column, + preceding_window: WindowType, + following_window: WindowType, + min_periods: int, + agg: Aggregation, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index 4fd0b005431..11acf57ccf4 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column +__all__ = ["rolling_window"] cpdef Column rolling_window( Column source, diff --git a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi new file mode 100644 index 00000000000..410cf5de586 --- /dev/null +++ b/python/pylibcudf/pylibcudf/round.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class RoundingMethod(IntEnum): + HALF_UP = ... + HALF_EVEN = ... + +def round( + source: Column, + decimal_places: int = 0, + round_method: RoundingMethod = RoundingMethod.HALF_UP, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 689363e652d..09e5a9cc3bc 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.column.column cimport column from .column cimport Column +__all__ = ["RoundingMethod", "round"] cpdef Column round( Column source, diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi new file mode 100644 index 00000000000..0b72b10ef86 --- /dev/null +++ b/python/pylibcudf/pylibcudf/scalar.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class Scalar: + def type(self) -> DataType: ... + def is_valid(self) -> bool: ... + @staticmethod + def empty_like(column: Column) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index d4888a62ad1..1ac014e891e 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -11,6 +11,8 @@ from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType +__all__ = ["Scalar"] + # The DeviceMemoryResource attribute could be released prematurely # by the gc if the Scalar is in a reference cycle. Removing the tp_clear @@ -37,6 +39,8 @@ cdef class Scalar: # DeviceScalar. 
raise ValueError("Scalar should be constructed with a factory") + __hash__ = None + cdef const scalar* get(self) noexcept nogil: return self.c_obj.get() diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi new file mode 100644 index 00000000000..7f292b129b2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/search.pyi @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def lower_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def upper_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def contains(haystack: Column, needles: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 1a870248046..50353fcd0cc 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table +__all__ = ["contains", "lower_bound", "upper_bound"] cpdef Column lower_bound( Table haystack, diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi new file mode 100644 index 00000000000..5255d869a4d --- /dev/null +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import RankMethod +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order + +def sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def stable_sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def rank( + input_view: Column, + method: RankMethod, + column_order: Order, + null_handling: NullPolicy, + null_precedence: NullOrder, + percentage: bool, +) -> Column: ... +def is_sorted( + tbl: Table, column_order: list[Order], null_precedence: list[NullOrder] +) -> bool: ... +def segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... 
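A small sketch of the sorting stubs above; each key column gets one `Order` and one `NullOrder` entry (values illustrative, `interop.from_arrow` assumed).

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.types import NullOrder, Order

tbl = plc.Table([plc.interop.from_arrow(pa.array([3, 1, None, 2]))])

# Gather map of row indices that would sort the table, nulls last.
indices = plc.sorting.sorted_order(tbl, [Order.ASCENDING], [NullOrder.AFTER])
# Or materialize the sorted table directly.
sorted_tbl = plc.sorting.sort(tbl, [Order.ASCENDING], [NullOrder.AFTER])
```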
diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index fc40f03e1fd..fb29ef8c571 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -12,6 +12,18 @@ from pylibcudf.libcudf.types cimport null_order, null_policy, order from .column cimport Column from .table cimport Table +__all__ = [ + "is_sorted", + "rank", + "segmented_sort_by_key", + "sort", + "sort_by_key", + "sorted_order", + "stable_segmented_sort_by_key", + "stable_sort", + "stable_sort_by_key", + "stable_sorted_order", +] cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): """Computes the row indices required to sort the table. diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index a4f39792f0c..a20a23e2e58 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -17,6 +17,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) +cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) + cpdef Table unique( Table input, list keys, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi new file mode 100644 index 00000000000..99cade48309 --- /dev/null +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NanPolicy, NullEquality, NullPolicy + +class DuplicateKeepOption(IntEnum): + KEEP_ANY = ... + KEEP_FIRST = ... + KEEP_LAST = ... + KEEP_NONE = ... + +def drop_nulls( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def drop_nans( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def apply_boolean_mask(source_table: Table, boolean_mask: Column) -> Table: ... +def unique( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, +) -> Table: ... +def distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def distinct_indices( + input: Table, + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Column: ... +def stable_distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def unique_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... +def distinct_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... 
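A sketch of `distinct` from the stub above; `KEEP_ANY` keeps an arbitrary row per duplicate group, and nulls compare equal here (values illustrative, `interop.from_arrow` assumed).

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.stream_compaction import DuplicateKeepOption
from pylibcudf.types import NanEquality, NullEquality

tbl = plc.Table([plc.interop.from_arrow(pa.array([1, 1, 2, None]))])

# Keep one row per distinct value of key column 0.
deduped = plc.stream_compaction.distinct(
    tbl,
    [0],
    DuplicateKeepOption.KEEP_ANY,
    NullEquality.EQUAL,
    NanEquality.ALL_EQUAL,
)
```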
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index 2145398a191..6e403ca1b07 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -21,6 +21,18 @@ from pylibcudf.libcudf.stream_compaction import \ from .column cimport Column from .table cimport Table +__all__ = [ + "DuplicateKeepOption", + "apply_boolean_mask", + "distinct", + "distinct_count", + "distinct_indices", + "drop_nans", + "drop_nulls", + "stable_distinct", + "unique", + "unique_count", +] cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index fa7294c7dbd..67054f0b447 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -28,6 +28,7 @@ from .side_type import SideType __all__ = [ + "SideType", "attributes", "capitalize", "case", @@ -46,9 +47,8 @@ "replace", "replace_re", "slice", - "strip", "split", - "SideType", + "strip", "translate", "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyi b/python/pylibcudf/pylibcudf/strings/attributes.pyi new file mode 100644 index 00000000000..7fd5c9773d4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def count_characters(source_strings: Column) -> Column: ... +def count_bytes(source_strings: Column) -> Column: ... +def code_points(source_strings: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 8e46a32835d..f1eb09b4965 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport attributes as cpp_attributes +__all__ = ["code_points", "count_bytes", "count_characters"] cpdef Column count_characters(Column source_strings): """ diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyi b/python/pylibcudf/pylibcudf/strings/capitalize.pyi new file mode 100644 index 00000000000..5c6689418e2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.char_types import StringCharacterTypes + +def capitalize(input: Column, delimiters: Scalar | None = None) -> Column: ... +def title( + input: Column, + sequence_type: StringCharacterTypes = StringCharacterTypes.ALPHA, +) -> Column: ... +def is_title(input: Column) -> Column: ... 
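To illustrate the capitalize stub above, a short sketch (with the default delimiters=None, only the first character of each string is capitalized; data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["hello world", None]))
    out = plc.strings.capitalize.capitalize(col)
    print(plc.interop.to_arrow(out))  # "Hello world", null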
diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx index 06b991c3cf1..a54480b8e4a 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -14,6 +14,7 @@ from pylibcudf.strings.char_types cimport string_character_types from cython.operator import dereference +__all__ = ["capitalize", "is_title", "title"] cpdef Column capitalize( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/case.pyi b/python/pylibcudf/pylibcudf/strings/case.pyi new file mode 100644 index 00000000000..4e50db4d1da --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/case.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def to_lower(input: Column) -> Column: ... +def to_upper(input: Column) -> Column: ... +def swapcase(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx index 9e6cd7717d3..d0e054bef72 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pyx +++ b/python/pylibcudf/pylibcudf/strings/case.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport case as cpp_case +__all__ = ["swapcase", "to_lower", "to_upper"] cpdef Column to_lower(Column input): cdef unique_ptr[column] c_result diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyi b/python/pylibcudf/pylibcudf/strings/char_types.pyi new file mode 100644 index 00000000000..daa36cbb68d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyi @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class StringCharacterTypes(IntEnum): + DECIMAL = ... + NUMERIC = ... + DIGIT = ... + ALPHA = ... + SPACE = ... + UPPER = ... + LOWER = ... + ALPHANUM = ... + CASE_TYPES = ... + ALL_TYPES = ... + +def all_characters_of_type( + source_strings: Column, + types: StringCharacterTypes, + verify_types: StringCharacterTypes, +) -> Column: ... +def filter_characters_of_type( + source_strings: Column, + types_to_remove: StringCharacterTypes, + replacement: Scalar, + types_to_keep: StringCharacterTypes, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index cb04efe5e8f..0af4a1f9c37 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -12,6 +12,11 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint +__all__ = [ + "StringCharacterTypes", + "all_characters_of_type", + "filter_characters_of_type", +] cpdef Column all_characters_of_type( Column source_strings, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyi b/python/pylibcudf/pylibcudf/strings/combine.pyi new file mode 100644 index 00000000000..3094b20f141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyi @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class SeparatorOnNulls(IntEnum): + YES = ... + NO = ... + +class OutputIfEmptyList(IntEnum): + EMPTY_STRING = ... + NULL_ELEMENT = ... 
+ +def concatenate( + strings_columns: Table, + separator: Column | Scalar, + narep: Scalar | None = None, + col_narep: Scalar | None = None, + separate_nulls: SeparatorOnNulls = SeparatorOnNulls.YES, +) -> Column: ... +def join_strings( + input: Column, separator: Scalar, narep: Scalar +) -> Column: ... +def join_list_elements( + lists_strings_column: Column, + separator: Column | Scalar, + separator_narep: Scalar, + string_narep: Scalar, + separate_nulls: SeparatorOnNulls, + empty_list_policy: OutputIfEmptyList, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index f17d5265ab4..dc1e72c799b 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -17,6 +17,13 @@ from pylibcudf.libcudf.strings.combine import \ from pylibcudf.libcudf.strings.combine import \ separator_on_nulls as SeparatorOnNulls # no-cython-lint +__all__ = [ + "OutputIfEmptyList", + "SeparatorOnNulls", + "concatenate", + "join_list_elements", + "join_strings", +] cpdef Column concatenate( Table strings_columns, diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi new file mode 100644 index 00000000000..1f0620383b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/contains.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram + +def contains_re(input: Column, prog: RegexProgram) -> Column: ... +def count_re(input: Column, prog: RegexProgram) -> Column: ... +def matches_re(input: Column, prog: RegexProgram) -> Column: ... +def like( + input: Column, + pattern: Column | Scalar, + escape_character: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index d4b1130241d..7b4c53ed853 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["contains_re", "count_re", "like", "matches_re"] cpdef Column contains_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index aa27a7c8929..08b5034456e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -10,3 +10,15 @@ convert_lists, convert_urls, ) + +__all__ = [ + "convert_booleans", + "convert_datetime", + "convert_durations", + "convert_fixed_point", + "convert_floats", + "convert_integers", + "convert_ipv4", + "convert_lists", + "convert_urls", +] diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi new file mode 100644 index 00000000000..77c09242e9a --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def to_booleans(input: Column, true_string: Scalar) -> Column: ... +def from_booleans( + booleans: Column, true_string: Scalar, false_string: Scalar +) -> Column: ... 
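A sketch of the round trip the convert_booleans declarations above describe, using string scalars as the true/false markers (values illustrative):

    import pyarrow as pa
    import pylibcudf as plc
    from pylibcudf.strings.convert import convert_booleans

    strings = plc.interop.from_arrow(pa.array(["yes", "no", "yes"]))
    true_s = plc.interop.from_arrow(pa.scalar("yes"))
    false_s = plc.interop.from_arrow(pa.scalar("no"))

    # Strings equal to the true marker become True, everything else False.
    bools = convert_booleans.to_booleans(strings, true_s)
    back = convert_booleans.from_booleans(bools, true_s, false_s)
    print(plc.interop.to_arrow(back))  # "yes", "no", "yes"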
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index dc12b291b11..1899a3b27cc 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -12,6 +12,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["from_booleans", "to_booleans"] cpdef Column to_booleans(Column input, Scalar true_string): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi new file mode 100644 index 00000000000..c6857169765 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_timestamps( + input: Column, timestamp_type: DataType, format: str +) -> Column: ... +def from_timestamps( + timestamps: Column, format: str, input_strings_names: Column +) -> Column: ... +def is_timestamp(input: Column, format: str) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index 0ee60812e00..f1cd684166c 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_timestamps", "is_timestamp", "to_timestamps"] cpdef Column to_timestamps( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi new file mode 100644 index 00000000000..a5787a5fe49 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_durations( + input: Column, duration_type: DataType, format: str +) -> Column: ... +def from_durations(durations: Column, format: str | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index 31980ace418..a9654afd00a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_durations", "to_durations"] cpdef Column to_durations( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi new file mode 100644 index 00000000000..1192d3dfcd6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_fixed_point(input: Column, output_type: DataType) -> Column: ... +def from_fixed_point(input: Column) -> Column: ... +def is_fixed_point( + input: Column, decimal_type: DataType | None = None +) -> Column: ... 
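The convert_datetime stub above pairs a strptime-style format string with an explicit timestamp type; a minimal sketch (format and values illustrative):

    import pyarrow as pa
    import pylibcudf as plc
    from pylibcudf.strings.convert import convert_datetime

    col = plc.interop.from_arrow(pa.array(["2020-01-31", "2020-02-29"]))
    ts = convert_datetime.to_timestamps(
        col,
        plc.types.DataType(plc.types.TypeId.TIMESTAMP_SECONDS),
        "%Y-%m-%d",
    )
    print(plc.interop.to_arrow(ts))  # timestamp[s] values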
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 962a47dfadf..00cbc822f36 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -9,6 +9,8 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType, type_id +__all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"] + cpdef Column to_fixed_point(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi new file mode 100644 index 00000000000..ddf4042e10d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_floats(strings: Column, output_type: DataType) -> Column: ... +def from_floats(floats: Column) -> Column: ... +def is_float(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index 1296f4f9db5..b5199aac577 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = ["from_floats", "is_float", "to_floats"] cpdef Column to_floats(Column strings, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi new file mode 100644 index 00000000000..b96226fba90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_integers(input: Column, output_type: DataType) -> Column: ... +def from_integers(integers: Column) -> Column: ... +def is_integer(input: Column, int_type: DataType | None = None) -> Column: ... +def hex_to_integers(input: Column, output_type: DataType) -> Column: ... +def is_hex(input: Column) -> Column: ... +def integers_to_hex(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx index 5558683a502..12984e15ce9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -9,6 +9,14 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = [ + "from_integers", + "hex_to_integers", + "integers_to_hex", + "is_hex", + "is_integer", + "to_integers" +] cpdef Column to_integers(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi new file mode 100644 index 00000000000..b017b32598c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def ipv4_to_integers(input: Column) -> Column: ... 
+def integers_to_ipv4(integers: Column) -> Column: ... +def is_ipv4(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index 834781f95f3..e7c6aae4fa8 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 +__all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"] cpdef Column ipv4_to_integers(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi new file mode 100644 index 00000000000..6ab3a4183e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def format_list_column( + input: Column, + na_rep: Scalar | None = None, + separators: Column | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index cbfe5f5aa8b..518f72f6644 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -17,6 +17,7 @@ from pylibcudf.types cimport type_id from cython.operator import dereference +__all__ = ["format_list_column"] cpdef Column format_list_column( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi new file mode 100644 index 00000000000..49b8468957c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def url_encode(input: Column) -> Column: ... +def url_decode(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index 82f8a75f1d9..bd5e23bca43 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls +__all__ = ["url_decode", "url_encode"] cpdef Column url_encode(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyi b/python/pylibcudf/pylibcudf/strings/extract.pyi new file mode 100644 index 00000000000..4354bd3072d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def extract(input: Column, prog: RegexProgram) -> Table: ... +def extract_all_record(input: Column, prog: RegexProgram) -> Column: ... 
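A usage sketch for the extract stub above: each regex capture group becomes one column of the returned Table, with nulls for rows that do not match (pattern and data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["a1", "b2", "cc"]))
    prog = plc.strings.regex_program.RegexProgram.create(
        "([a-z])([0-9])", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    tbl = plc.strings.extract.extract(col, prog)
    for group_col in tbl.columns():
        print(plc.interop.to_arrow(group_col))
    # "a", "b", null  then  "1", "2", null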
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index b56eccc8287..0ce70666e92 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.table cimport Table +__all__ = ["extract", "extract_all_record"] cpdef Table extract(Column input, RegexProgram prog): """ diff --git a/python/pylibcudf/pylibcudf/strings/find.pyi b/python/pylibcudf/pylibcudf/strings/find.pyi new file mode 100644 index 00000000000..3d04a9c3161 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def find( + input: Column, target: Column | Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def rfind( + input: Column, target: Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def contains(input: Column, target: Column | Scalar) -> Column: ... +def starts_with(input: Column, target: Column | Scalar) -> Column: ... +def ends_with(input: Column, target: Column | Scalar) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 6fc6dca24fd..f0af339ff08 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -10,6 +10,7 @@ from cython.operator import dereference from pylibcudf.libcudf.scalar.scalar cimport string_scalar +__all__ = ["contains", "ends_with", "find", "rfind", "starts_with"] cpdef Column find( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi new file mode 100644 index 00000000000..3d46fd2fa6d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def find_multiple(input: Column, targets: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx index 672aa606bd0..c9ce734b4be 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple +__all__ = ["find_multiple"] cpdef Column find_multiple(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyi b/python/pylibcudf/pylibcudf/strings/findall.pyi new file mode 100644 index 00000000000..77e38581d22 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram + +def find_re(input: Column, pattern: RegexProgram) -> Column: ... +def findall(input: Column, pattern: RegexProgram) -> Column: ... 
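The find stub above accepts either a column or a scalar target; a minimal sketch with a scalar (data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["apple", "banana", None]))
    target = plc.interop.from_arrow(pa.scalar("an"))
    # Returns a BOOL8 column; null inputs stay null.
    print(plc.interop.to_arrow(plc.strings.find.contains(col, target)))
    # False, True, null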
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 89fa4302824..23c84675a16 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport findall as cpp_findall from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["findall", "find_re"] cpdef Column findall(Column input, RegexProgram pattern): """ diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyi b/python/pylibcudf/pylibcudf/strings/padding.pyi new file mode 100644 index 00000000000..a991935e6e5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.side_type import SideType + +def pad( + input: Column, width: int, side: SideType, fill_char: str +) -> Column: ... +def zfill(input: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx index f6950eecf60..0e349a7be47 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyx +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport padding as cpp_padding from pylibcudf.libcudf.strings.side_type cimport side_type +__all__ = ["pad", "zfill"] cpdef Column pad(Column input, size_type width, side_type side, str fill_char): """ diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyi b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi new file mode 100644 index 00000000000..c551cebf181 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class RegexFlags(IntEnum): + DEFAULT = ... + MULTILINE = ... + DOTALL = ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx index ce3b6b10a42..65b504e0dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx @@ -2,3 +2,5 @@ from pylibcudf.libcudf.strings.regex_flags import \ regex_flags as RegexFlags # no-cython-lint + +__all__ = ["RegexFlags"] diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyi b/python/pylibcudf/pylibcudf/strings/regex_program.pyi new file mode 100644 index 00000000000..9abd6fa7802 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.strings.regex_flags import RegexFlags + +class RegexProgram: + def __init__(self): ... + @staticmethod + def create(pattern: str, flags: RegexFlags) -> RegexProgram: ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx index 91f585cd637..46bfde074d2 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx @@ -11,6 +11,7 @@ from pylibcudf.strings.regex_flags import RegexFlags from pylibcudf.strings.regex_flags cimport regex_flags +__all__ = ["RegexProgram"] cdef class RegexProgram: """Regex program class. 
@@ -24,6 +25,8 @@ cdef class RegexProgram: def __init__(self, *args, **kwargs): raise ValueError("Do not instantiate RegexProgram directly, use create") + __hash__ = None + @staticmethod def create(str pattern, int flags): """Create a program from a pattern. diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi new file mode 100644 index 00000000000..93a46b71caa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def repeat_strings(input: Column, repeat_times: Column | int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index fb2bb13c666..a497b1f438e 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type +__all__ = ["repeat_strings"] cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): """ diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyi b/python/pylibcudf/pylibcudf/strings/replace.pyi new file mode 100644 index 00000000000..64df09ef7e8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace( + input: Column, target: Scalar, repl: Scalar, maxrepl: int = -1 +) -> Column: ... +def replace_multiple( + input: Column, target: Column, repl: Column, maxrepl: int = -1 +) -> Column: ... +def replace_slice( + input: Column, repl: Scalar | None = None, start: int = 0, stop: int = -1 +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 2b94f5e3fee..3ba6c1b5530 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.strings.replace cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["replace", "replace_multiple", "replace_slice"] cpdef Column replace( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyi b/python/pylibcudf/pylibcudf/strings/replace_re.pyi new file mode 100644 index 00000000000..056bafbf7ef --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import overload + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_flags import RegexFlags +from pylibcudf.strings.regex_program import RegexProgram + +@overload +def replace_re( + input: Column, + pattern: RegexProgram, + replacement: Scalar, + max_replace_count: int = -1, +) -> Column: ... +@overload +def replace_re( + input: Column, + patterns: list[str], + replacement: Column, + max_replace_count: int = -1, + flags: RegexFlags = RegexFlags.DEFAULT, +) -> Column: ... +def replace_with_backrefs( + input: Column, prog: RegexProgram, replacement: str +) -> Column: ... 
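The two replace_re overloads declared above distinguish a single compiled program with a scalar replacement from a list of patterns with a column of replacements; a sketch of the first form (pattern and data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["a1b2", "c3"]))
    prog = plc.strings.regex_program.RegexProgram.create(
        "[0-9]", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    repl = plc.interop.from_arrow(pa.scalar("#"))
    out = plc.strings.replace_re.replace_re(col, prog, repl)
    print(plc.interop.to_arrow(out))  # "a#b#", "c#"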
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx index ccc33fd4425..bdabc779ddf 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -16,6 +16,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_flags cimport regex_flags from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["replace_re", "replace_with_backrefs"] cpdef Column replace_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyi b/python/pylibcudf/pylibcudf/strings/side_type.pyi new file mode 100644 index 00000000000..532edd60077 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class SideType(IntEnum): + LEFT = ... + RIGHT = ... + BOTH = ... diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index cf0c770cc11..87db4206a9c 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,3 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint + +__all__ = ["SideType"] diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi new file mode 100644 index 00000000000..7bf9a7cb8c6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/slice.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def slice_strings( + input: Column, + start: Column | Scalar | None = None, + stop: Column | Scalar | None = None, + step: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index 70d10cab36c..d32de7c50e0 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -14,6 +14,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["slice_strings"] cpdef Column slice_strings( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py index 2033e5e275b..db2a597882e 100644 --- a/python/pylibcudf/pylibcudf/strings/split/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -1,2 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import partition, split + +__all__ = ["partition", "split"] diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyi b/python/pylibcudf/pylibcudf/strings/split/partition.pyi new file mode 100644 index 00000000000..f19a463bd7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +def partition(input: Column, delimiter: Scalar | None = None) -> Table: ... +def rpartition(input: Column, delimiter: Scalar | None = None) -> Table: ... 
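A sketch of the partition stub above: each string splits on the first occurrence of the delimiter into a three-column table of (head, delimiter, tail), with empty strings where the delimiter is absent (data illustrative):

    import pyarrow as pa
    import pylibcudf as plc
    from pylibcudf.strings.split import partition

    col = plc.interop.from_arrow(pa.array(["a-b-c", "xyz"]))
    delim = plc.interop.from_arrow(pa.scalar("-"))
    tbl = partition.partition(col, delim)
    for part in tbl.columns():
        print(plc.interop.to_arrow(part))
    # ["a", "xyz"], ["-", ""], ["b-c", ""]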
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index 0fb4f186c41..75537ea46d3 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -13,6 +13,7 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = ["partition", "rpartition"] cpdef Table partition(Column input, Scalar delimiter=None): """ diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyi b/python/pylibcudf/pylibcudf/strings/split/split.pyi new file mode 100644 index 00000000000..3ccf0bc2a01 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def split( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def rsplit( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def split_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def rsplit_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def split_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def rsplit_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def split_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... +def rsplit_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index e3827f6645e..90087f996f0 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -13,6 +13,16 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = [ + "rsplit", + "rsplit_re", + "rsplit_record", + "rsplit_record_re", + "split", + "split_re", + "split_record", + "split_record_re", +] cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): """ diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyi b/python/pylibcudf/pylibcudf/strings/strip.pyi new file mode 100644 index 00000000000..680355fc88f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.side_type import SideType + +def strip( + input: Column, + side: SideType = SideType.BOTH, + to_strip: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx index 429a23c3cdf..805d959891b 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyx +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings cimport strip as cpp_strip from pylibcudf.scalar cimport Scalar from pylibcudf.strings.side_type cimport side_type +__all__ = ["strip"] cpdef Column strip( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyi b/python/pylibcudf/pylibcudf/strings/translate.pyi new file mode 100644 index 00000000000..7158b6eb05c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from collections.abc import Mapping +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class FilterType(IntEnum): + KEEP = ... + REMOVE = ... + +def translate( + input: Column, chars_table: Mapping[int | str, int | str] +) -> Column: ... +def filter_characters( + input: Column, + characters_to_filter: Mapping[int | str, int | str], + keep_characters: FilterType, + replacement: Scalar, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index d85da8e6cdd..ba1e8dc5d27 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -14,6 +14,7 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.translate import \ filter_type as FilterType # no-cython-lint +__all__ = ["FilterType", "filter_characters", "translate"] cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): """ diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyi b/python/pylibcudf/pylibcudf/strings/wrap.pyi new file mode 100644 index 00000000000..5658f279197 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def wrap(input: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx index 2ced250f837..b696eb48e47 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyx +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type +__all__ = ["wrap"] cpdef Column wrap(Column input, size_type width): """ diff --git a/python/pylibcudf/pylibcudf/table.pyi b/python/pylibcudf/pylibcudf/table.pyi new file mode 100644 index 00000000000..5aef7e009c8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/table.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class Table: + def __init__(self, column: list[Column]): ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def columns(self) -> list[Column]: ... diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index d0d6f2343d0..0c1e88a927c 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column +__all__ = ["Table"] cdef class Table: """A list of columns of the same size. @@ -24,6 +25,8 @@ cdef class Table: raise ValueError("All columns must be pylibcudf Column objects") self._columns = columns + __hash__ = None + cdef table_view view(self) nogil: """Generate a libcudf table_view to pass to libcudf algorithms. diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py new file mode 100644 index 00000000000..f69e940e34e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize( + "source_func", + [ + "make_source", + "make_source_from_file", + ], +) +@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()]) +def test_multibyte_split(source_func, options, tmp_path): + data = "x::y::z" + func = getattr(plc.io.text, source_func) + if source_func == "make_source": + source = func(data) + elif source_func == "make_source_from_file": + fle = tmp_path / "fle.txt" + fle.write_text(data) + source = func(str(fle)) + result = plc.io.text.multibyte_split(source, "::", options) + expected = pa.array(["x::", "y::", "z"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index bbb08e8b95a..a33122221f6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -541,13 +541,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_AND, pa.compute.and_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_AND, - pa.compute.and_, - ), ( "int64", "int64", @@ -562,13 +555,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_OR, pa.compute.or_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_OR, - pa.compute.or_, - ), ( "int64", "int64", diff --git a/python/pylibcudf/pylibcudf/tests/test_filling.py b/python/pylibcudf/pylibcudf/tests/test_filling.py new file mode 100644 index 00000000000..91c7e42a0a0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_filling.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from datetime import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq, assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def pa_col(): + return pa.array([2, 3, 5, 7, 11]) + + +@pytest.fixture +def pa_table(): + pa_col = pa.array([1, 2, 3]) + return pa.table([pa_col], names=["a"]) + + +def test_fill(pa_col): + result = plc.filling.fill( + plc.interop.from_arrow(pa_col), + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_fill_in_place(pa_col): + result = plc.interop.from_arrow(pa_col) + plc.filling.fill_in_place( + result, + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_sequence(): + size = 5 + init_scalar = plc.interop.from_arrow(pa.scalar(10)) + step_scalar = plc.interop.from_arrow(pa.scalar(2)) + result = plc.filling.sequence( + size, + init_scalar, + step_scalar, + ) + expect = pa.array([10, 12, 14, 16, 18]) + assert_column_eq(result, expect) + + +def test_repeat_with_count_int(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = 2 + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1, 1, 2, 2, 3, 3]], names=["a"]) + assert_table_eq(expect, result) + + +def test_repeat_with_count_column(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = plc.interop.from_arrow(pa.array([1, 2, 3])) + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1] + [2] * 2 + [3] * 3], names=["a"]) + assert_table_eq(expect, result) + + +def test_calendrical_month_sequence(): + n = 5 + init_date = datetime(2020, 1, 31) + init = plc.interop.from_arrow( + pa.scalar(init_date, 
type=pa.timestamp("ms")) + ) + months = 1 + result = plc.filling.calendrical_month_sequence(n, init, months) + expected_dates = [ + datetime(2020, 1, 31), + datetime(2020, 2, 29), + datetime(2020, 3, 31), + datetime(2020, 4, 30), + datetime(2020, 5, 31), + ] + expect = pa.array(expected_dates, type=pa.timestamp("ms")) + assert_column_eq(result, expect) diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index beacfc63ce5..946d583d1cc 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -6,8 +6,12 @@ import pylibcudf as plc -@pytest.mark.parametrize("left_inclusive", [True, False]) -@pytest.mark.parametrize("right_inclusive", [True, False]) +@pytest.mark.parametrize( + "left_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) +@pytest.mark.parametrize( + "right_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) def test_label_bins(left_inclusive, right_inclusive): in_col = plc.interop.from_arrow(pa.array([1, 2, 3])) left_edges = plc.interop.from_arrow(pa.array([0, 5])) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index f3ef555f11d..8c1229c2a04 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -62,12 +62,12 @@ def test_concatenate_rows(test_data): [ ( [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]], - False, + plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, [[1, 2, 3, 4, 5], None], ), ( [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]], - True, + plc.lists.ConcatenateNullPolicy.IGNORE, [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]], ), ], @@ -138,7 +138,9 @@ def test_index_of_scalar(list_column, scalar): plc_column = plc.interop.from_arrow(arr) plc_scalar = plc.interop.from_arrow(scalar) - res = plc.lists.index_of(plc_column, plc_scalar, True) + res = plc.lists.index_of( + plc_column, plc_scalar, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array([1, -1, -1, -1], type=pa.int32()) @@ -150,7 +152,9 @@ def test_index_of_list_column(list_column, search_key_column): arr2, expect = search_key_column plc_column1 = plc.interop.from_arrow(arr1) plc_column2 = plc.interop.from_arrow(arr2) - res = plc.lists.index_of(plc_column1, plc_column2, True) + res = plc.lists.index_of( + plc_column1, plc_column2, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array(search_key_column[1], type=pa.int32()) @@ -227,39 +231,34 @@ def test_sequences(): @pytest.mark.parametrize( - "ascending,na_position,expected", + "order,na_position,expected", [ ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.BEFORE, [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]], ), ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.AFTER, [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]], ), ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.BEFORE, [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]], ), ( - False, - plc.types.NullOrder.AFTER, - [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], - ), - ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.AFTER, [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], ), ], ) -def test_sort_lists(lists_column, ascending, na_position, expected): +def test_sort_lists(lists_column, order, na_position, expected): plc_column = plc.interop.from_arrow(pa.array(lists_column)) - res = plc.lists.sort_lists(plc_column, ascending, na_position, False) - 
res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True) + res = plc.lists.sort_lists(plc_column, order, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, order, na_position, True) expect = pa.array(expected) @@ -272,44 +271,44 @@ def test_sort_lists(lists_column, ascending, na_position, expected): [ ( plc.lists.difference_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, 5]], ), ( plc.lists.difference_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, None, 5]], ), ( plc.lists.have_overlap, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [True, False, None, True], ), ( plc.lists.have_overlap, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [True, False, None, False], ), ( plc.lists.intersect_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 1, 2], [], None, [None]], ), ( plc.lists.intersect_distinct, - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[1, 2], [], None, [None]], ), ( plc.lists.union_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [ [np.nan, 2, 1, 3], [1, 2, 3, 4, 5], @@ -319,8 +318,8 @@ def test_sort_lists(lists_column, ascending, na_position, expected): ), ( plc.lists.union_distinct, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 2, 1, np.nan, 3], [1, 2, 3, 4, 5], @@ -352,20 +351,24 @@ def test_set_operations( @pytest.mark.parametrize( "nans_equal,nulls_equal,expected", [ - (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), ( - False, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], ), ( - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], ), ( - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ead9ee094af..ec533e64307 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -21,15 +21,19 @@ def word_minhash_input_data(request): @pytest.mark.parametrize("width", [5, 12]) -def test_minhash(minhash_input_data, width): +def test_minhash_permuted(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash + plc.nvtext.minhash.minhash_permuted if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64 + else plc.nvtext.minhash.minhash64_permuted ) result = minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + plc.interop.from_arrow(input_arr), + 0, + plc.interop.from_arrow(seeds), + plc.interop.from_arrow(seeds), + width, ) pa_result = plc.interop.to_arrow(result) assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py 
b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index f461657281a..e85cd1cc443 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -8,7 +8,7 @@ import pylibcudf as plc -@pytest.fixture() +@pytest.fixture def str_data(): pa_data = pa.array(["A", None]) return pa_data, plc.interop.from_arrow(pa_data) diff --git a/python/pylibcudf/pylibcudf/traits.pyi b/python/pylibcudf/pylibcudf/traits.pyi new file mode 100644 index 00000000000..fdb31a262cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/traits.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.types import DataType + +def is_relationally_comparable(typ: DataType) -> bool: ... +def is_equality_comparable(typ: DataType) -> bool: ... +def is_numeric(typ: DataType) -> bool: ... +def is_numeric_not_bool(typ: DataType) -> bool: ... +def is_index_type(typ: DataType) -> bool: ... +def is_unsigned(typ: DataType) -> bool: ... +def is_integral(typ: DataType) -> bool: ... +def is_integral_not_bool(typ: DataType) -> bool: ... +def is_floating_point(typ: DataType) -> bool: ... +def is_boolean(typ: DataType) -> bool: ... +def is_timestamp(typ: DataType) -> bool: ... +def is_fixed_point(typ: DataType) -> bool: ... +def is_duration(typ: DataType) -> bool: ... +def is_chrono(typ: DataType) -> bool: ... +def is_dictionary(typ: DataType) -> bool: ... +def is_fixed_width(typ: DataType) -> bool: ... +def is_compound(typ: DataType) -> bool: ... +def is_nested(typ: DataType) -> bool: ... +def is_bit_castable(source: DataType, target: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx index 9c52e0ac1ab..3cf0a3a4b3b 100644 --- a/python/pylibcudf/pylibcudf/traits.pyx +++ b/python/pylibcudf/pylibcudf/traits.pyx @@ -5,6 +5,27 @@ from pylibcudf.libcudf.utilities cimport traits from .types cimport DataType +__all__ = [ + "is_bit_castable", + "is_boolean", + "is_chrono", + "is_compound", + "is_dictionary", + "is_duration", + "is_equality_comparable", + "is_fixed_point", + "is_fixed_width", + "is_floating_point", + "is_index_type", + "is_integral", + "is_integral_not_bool", + "is_nested", + "is_numeric", + "is_numeric_not_bool", + "is_relationally_comparable", + "is_timestamp", + "is_unsigned", +] cpdef bool is_relationally_comparable(DataType typ): """Checks if the given data type supports relational comparisons. diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi new file mode 100644 index 00000000000..5cbd2e635f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/transform.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table +from pylibcudf.types import DataType + +def nans_to_nulls(input: Column) -> tuple[gpumemoryview, int]: ... +def compute_column(input: Table, expr: Expression) -> Column: ... +def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ... +def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ... +def transform( + input: Column, unary_udf: str, output_type: DataType, is_ptx: bool +) -> Column: ... +def encode(input: Table) -> tuple[Table, Column]: ... +def one_hot_encode(input: Column, categories: Column) -> Table: ... 
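As a sketch of the transform stubs above, one_hot_encode maps a column against a column of categories and returns one BOOL8 indicator column per category (data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array([1, 2, 1]))
    cats = plc.interop.from_arrow(pa.array([1, 2]))
    encoded = plc.transform.one_hot_encode(col, cats)
    for indicator in encoded.columns():
        print(plc.interop.to_arrow(indicator))
    # [True, False, True]  then  [False, True, False]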
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
index e8d95cadb0c..9700bcff221 100644
--- a/python/pylibcudf/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -18,6 +18,15 @@ from .gpumemoryview cimport gpumemoryview
 from .types cimport DataType
 from .utils cimport int_to_bitmask_ptr
 
+__all__ = [
+    "bools_to_mask",
+    "compute_column",
+    "encode",
+    "mask_to_bools",
+    "nans_to_nulls",
+    "one_hot_encode",
+    "transform",
+]
 
 cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
     """Create a null mask preserving existing nulls and converting nans to null.
diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi
new file mode 100644
index 00000000000..a84ab8a60ea
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transpose.pyi
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.table import Table
+
+def transpose(input_table: Table) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx
index a24f937ced3..5eb3e58cebc 100644
--- a/python/pylibcudf/pylibcudf/transpose.pyx
+++ b/python/pylibcudf/pylibcudf/transpose.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["transpose"]
 cpdef Table transpose(Table input_table):
     """Transpose a Table.
diff --git a/python/pylibcudf/pylibcudf/types.pyi b/python/pylibcudf/pylibcudf/types.pyi
new file mode 100644
index 00000000000..c91a95414bd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/types.pyi
@@ -0,0 +1,86 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from enum import IntEnum
+from typing import Final
+
+class Interpolation(IntEnum):
+    LINEAR = ...
+    LOWER = ...
+    HIGHER = ...
+    MIDPOINT = ...
+    NEAREST = ...
+
+class MaskState(IntEnum):
+    UNALLOCATED = ...
+    UNINITIALIZED = ...
+    ALL_VALID = ...
+    ALL_NULL = ...
+
+class NanEquality(IntEnum):
+    ALL_EQUAL = ...
+    UNEQUAL = ...
+
+class NanPolicy(IntEnum):
+    NAN_IS_NULL = ...
+    NAN_IS_VALID = ...
+
+class NullEquality(IntEnum):
+    EQUAL = ...
+    UNEQUAL = ...
+
+class NullOrder(IntEnum):
+    AFTER = ...
+    BEFORE = ...
+
+class NullPolicy(IntEnum):
+    EXCLUDE = ...
+    INCLUDE = ...
+
+class Order(IntEnum):
+    ASCENDING = ...
+    DESCENDING = ...
+
+class Sorted(IntEnum):
+    NO = ...
+    YES = ...
+
+class TypeId(IntEnum):
+    EMPTY = ...
+    INT8 = ...
+    INT16 = ...
+    INT32 = ...
+    INT64 = ...
+    UINT8 = ...
+    UINT16 = ...
+    UINT32 = ...
+    UINT64 = ...
+    FLOAT32 = ...
+    FLOAT64 = ...
+    BOOL8 = ...
+    TIMESTAMP_DAYS = ...
+    TIMESTAMP_SECONDS = ...
+    TIMESTAMP_MILLISECONDS = ...
+    TIMESTAMP_MICROSECONDS = ...
+    TIMESTAMP_NANOSECONDS = ...
+    DURATION_DAYS = ...
+    DURATION_SECONDS = ...
+    DURATION_MILLISECONDS = ...
+    DURATION_MICROSECONDS = ...
+    DURATION_NANOSECONDS = ...
+    DICTIONARY32 = ...
+    STRING = ...
+    LIST = ...
+    DECIMAL32 = ...
+    DECIMAL64 = ...
+    DECIMAL128 = ...
+    STRUCT = ...
+    NUM_TYPE_IDS = ...
+
+class DataType:
+    def __init__(self, type_id: TypeId, scale: int = 0): ...
+    def id(self) -> TypeId: ...
+    def scale(self) -> int: ...
+
+def size_of(t: DataType) -> int: ...
+
+SIZE_TYPE: Final[DataType]
+SIZE_TYPE_ID: Final[TypeId]
diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx
index a0c31f994a3..afa1b56f38a 100644
--- a/python/pylibcudf/pylibcudf/types.pyx
+++ b/python/pylibcudf/pylibcudf/types.pyx
@@ -20,6 +20,22 @@ from pylibcudf.libcudf.types import null_order as NullOrder  # no-cython-lint, i
 from pylibcudf.libcudf.types import order as Order  # no-cython-lint, isort:skip
 from pylibcudf.libcudf.types import sorted as Sorted  # no-cython-lint, isort:skip
 
+__all__ = [
+    "DataType",
+    "Interpolation",
+    "MaskState",
+    "NanEquality",
+    "NanPolicy",
+    "NullEquality",
+    "NullOrder",
+    "NullPolicy",
+    "Order",
+    "SIZE_TYPE",
+    "SIZE_TYPE_ID",
+    "Sorted",
+    "TypeId",
+    "size_of"
+]
 
 cdef class DataType:
     """Indicator for the logical data type of an element in a column.
diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi
new file mode 100644
index 00000000000..7aa23b618f4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/unary.pyi
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+class UnaryOperator(IntEnum):
+    SIN = ...
+    COS = ...
+    TAN = ...
+    ARCSIN = ...
+    ARCCOS = ...
+    ARCTAN = ...
+    SINH = ...
+    COSH = ...
+    TANH = ...
+    ARCSINH = ...
+    ARCCOSH = ...
+    ARCTANH = ...
+    EXP = ...
+    LOG = ...
+    SQRT = ...
+    CBRT = ...
+    CEIL = ...
+    FLOOR = ...
+    ABS = ...
+    RINT = ...
+    BIT_INVERT = ...
+    NOT = ...
+
+def unary_operation(input: Column, op: UnaryOperator) -> Column: ...
+def is_null(input: Column) -> Column: ...
+def is_valid(input: Column) -> Column: ...
+def cast(input: Column, data_type: DataType) -> Column: ...
+def is_nan(input: Column) -> Column: ...
+def is_not_nan(input: Column) -> Column: ...
+def is_supported_cast(from_: DataType, to: DataType) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx
index 53e8c382b5e..b738ab53d1b 100644
--- a/python/pylibcudf/pylibcudf/unary.pyx
+++ b/python/pylibcudf/pylibcudf/unary.pyx
@@ -13,6 +13,16 @@ from pylibcudf.libcudf.unary import \
 
 from .column cimport Column
 from .types cimport DataType
 
+__all__ = [
+    "UnaryOperator",
+    "cast",
+    "is_nan",
+    "is_not_nan",
+    "is_null",
+    "is_supported_cast",
+    "is_valid",
+    "unary_operation",
+]
 cpdef Column unary_operation(Column input, unary_operator op):
     """Perform a unary operation on a column.
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index b2cec80f484..e83db47830c 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -22,8 +22,8 @@ dependencies = [
     "libcudf==24.12.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
-    "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'",
-    "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'",
+    "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'",
+    "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'",
     "rmm==24.12.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -56,18 +56,43 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 [tool.ruff]
 extend = "../../pyproject.toml"
 
+[tool.ruff.lint]
+extend-select = [
+    "TCH", # flake8-type-checking
+    "TID", # flake8-tidy-imports
+    "PT", # flake8-pytest-style
+]
+extend-ignore = [
+    "PT011", # pytest.raises(...) is too broad
+]
+
+[tool.ruff.lint.flake8-pytest-style]
+# https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
+fixture-parentheses = false
+mark-parentheses = false
+parametrize-names-type = "csv"
+parametrize-values-type = "list"
+parametrize-values-row-type = "tuple"
+
 [tool.ruff.lint.isort]
 combine-as-imports = true
-known-first-party = ["cudf"]
-section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+known-first-party = ["pylibcudf"]
+section-order = ["future", "standard-library", "third-party", "rapids", "first-party", "local-folder"]
 
 [tool.ruff.lint.isort.sections]
-dask = ["dask", "distributed", "dask_cuda"]
 rapids = ["rmm"]
 
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module
 addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib"
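
A closing aside (illustrative only, not part of the patch): the new flake8-pytest-style settings are what motivate the fixture change at the top of this section. With fixture-parentheses = false, ruff's PT001 rule flags @pytest.fixture() in favor of the bare decorator. A short sketch of the style the linter now enforces:

    # Sketch of the fixture style under the new ruff configuration.
    import pytest

    @pytest.fixture  # compliant: bare decorator when no arguments are passed
    def str_data():
        return ["A", None]

    # @pytest.fixture()                # would be flagged by PT001
    # @pytest.fixture(scope="module")  # parameterized fixtures remain valid

Relatedly, PT011 (which the config ignores above) would otherwise require pytest.raises(...) to use a narrow exception type or a match= pattern.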