diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 7ec48eb7817..a65cae34653 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -50,6 +50,7 @@ jobs:
       test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }}
       test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }}
       test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }}
+      test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }}
     steps:
       - name: Get PR info
         id: get-pr-info
@@ -82,6 +83,7 @@ jobs:
               - '!java/**'
               - '!notebooks/**'
               - '!python/**'
+              - '!ci/cudf_pandas_scripts/**'
             java:
               - '**'
               - '!CONTRIBUTING.md'
@@ -90,11 +92,13 @@ jobs:
               - '!img/**'
               - '!notebooks/**'
               - '!python/**'
+              - '!ci/cudf_pandas_scripts/**'
             notebooks:
               - '**'
               - '!CONTRIBUTING.md'
               - '!README.md'
               - '!java/**'
+              - '!ci/cudf_pandas_scripts/**'
             python:
               - '**'
               - '!CONTRIBUTING.md'
@@ -103,6 +107,16 @@ jobs:
               - '!img/**'
               - '!java/**'
               - '!notebooks/**'
+              - '!ci/cudf_pandas_scripts/**'
+            cudf_pandas:  # filter behind the cudf_pandas_any_changed / test_cudf_pandas output
+              - '**'
+              - 'ci/cudf_pandas_scripts/**'  # NOTE(review): already matched by '**'; presumably listed to mirror the '!ci/cudf_pandas_scripts/**' exclusions in the other filters
+              - '!CONTRIBUTING.md'
+              - '!README.md'
+              - '!docs/**'
+              - '!img/**'
+              - '!java/**'
+              - '!notebooks/**'
   checks:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
@@ -248,7 +262,7 @@ jobs:
   cudf-polars-polars-tests:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -289,7 +303,7 @@ jobs:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true'
+    if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -300,7 +314,7 @@ jobs:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true'
+    if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
diff --git a/build.sh b/build.sh
index 69d6481af42..56359eae235 100755
--- a/build.sh
+++ b/build.sh
@@ -17,13 +17,14 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
-HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
+VALIDARGS="clean libcudf pylibcudf cudf cudf_polars cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
+HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudf_polars] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
    clean                         - remove all existing build artifacts and configuration (start
                                    over)
    libcudf                       - build the cudf C++ code only
    pylibcudf                     - build the pylibcudf Python package
    cudf                          - build the cudf Python package
+   cudf_polars                   - build the cudf_polars Python package
    cudfjar                       - build cudf JAR with static libcudf using devtoolset toolchain
    dask_cudf                     - build the dask_cudf Python package
    benchmarks                    - build benchmarks
@@ -353,6 +354,12 @@ if buildAll || hasArg cudf; then
         python ${PYTHON_ARGS_FOR_INSTALL} .
 fi
 
+# Build and install the cudf_polars Python package
+if buildAll || hasArg cudf_polars; then
+
+    cd ${REPODIR}/python/cudf_polars
+    python ${PYTHON_ARGS_FOR_INSTALL} .
+fi
 
 # Build and install the dask_cudf Python package
 if buildAll || hasArg dask_cudf; then
diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 7a12db927e5..485b2ac8a51 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -67,20 +67,33 @@ def emoji_failed(x):
 # convert pr_results to a pandas DataFrame and then a markdown table
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
-diff_df = pr_df - main_df
-total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call']
-pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1)
-pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1)
+total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"]
+main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1)
+main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1)
+
+total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"]
+pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1)
+pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1)
+
+cpu_usage_mean = pr_df["CPU Usage"].mean().round(2)
+gpu_usage_mean = pr_df["GPU Usage"].mean().round(2)
+
+gpu_usage_rate_change = (pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()).round(2)  # signed (PR minus main); sign selects "increase"/"decrease" in the summary below
+pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0)
+pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0)
+main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0)
+main_df["GPU Usage"] = main_df["GPU Usage"].fillna(0)
 
-cpu_usage_mean = pr_df['CPU Usage'].mean().round(2)
-gpu_usage_mean = pr_df['GPU Usage'].mean().round(2)
+diff_df = pr_df - main_df
+diff_df["CPU Usage"] = diff_df["CPU Usage"].round(1).fillna(0)
+diff_df["GPU Usage"] = diff_df["GPU Usage"].round(1).fillna(0)
 
-# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
-pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%'
-pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%'
+# Add '%' suffix to "CPU Usage" and "GPU Usage" columns
+pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%"
+pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%"
 
-pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']]
-diff_df = diff_df[["total", "passed", "failed", "skipped"]]
+pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
+diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
 diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed)
@@ -99,13 +112,36 @@ def emoji_failed(x):
         "passed_diff": "Passed delta",
         "failed_diff": "Failed delta",
         "skipped_diff": "Skipped delta",
+        "CPU Usage_diff": "CPU Usage delta",
+        "GPU Usage_diff": "GPU Usage delta",
     }
 )
-df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False)
-
+df = df.sort_values(by=["CPU Usage delta", "Total tests"], ascending=False)
+df["CPU Usage delta"] = df["CPU Usage delta"].map(emoji_failed)
+df["GPU Usage delta"] = df["GPU Usage delta"].map(emoji_passed)
+df = df[
+    [
+        "Total tests",
+        "CPU Usage delta",
+        "GPU Usage delta",
+        "Passed tests",
+        "Failed tests",
+        "Skipped tests",
+        "CPU Usage",
+        "GPU Usage",
+        "Total delta",
+        "Passed delta",
+        "Failed delta",
+        "Skipped delta",
+    ]
+]
 print(comment)
 print()
-print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%")
+print(
+    f"Average GPU usage: {gpu_usage_mean}% {'an increase' if gpu_usage_rate_change > 0 else 'a decrease'} by {abs(gpu_usage_rate_change)}%"
+)
+print()
+print(f"Average CPU usage: {cpu_usage_mean}%")
 print()
 print("Here are the results of running the Pandas tests against this PR:")
 print()
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index f73e88bc0c8..870901d223b 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -82,6 +82,7 @@ for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do
   sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
 sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh
+sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_cudf_polars_polars_tests.sh
 
 # Java files
 NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT"
diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh
index 52a827af94c..95f78f17f2f 100755
--- a/ci/run_cudf_polars_polars_tests.sh
+++ b/ci/run_cudf_polars_polars_tests.sh
@@ -21,7 +21,7 @@ python -m pytest \
        -m "" \
        -p cudf_polars.testing.plugin \
        -v \
-       --tb=short \
+       --tb=native \
        ${DESELECTED_TESTS} \
        "$@" \
        py-polars/tests
diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
index 6c728a9537f..55399d0371a 100755
--- a/ci/test_cudf_polars_polars_tests.sh
+++ b/ci/test_cudf_polars_polars_tests.sh
@@ -10,7 +10,7 @@ set -eou pipefail
 # files in cudf_polars/pylibcudf", rather than "are there changes
 # between upstream and this branch which touch cudf_polars/pylibcudf"
 # TODO: is the target branch exposed anywhere in an environment variable?
-if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
+if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
 then
     HAS_CHANGES=1
     rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
@@ -33,8 +33,7 @@ python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
 rapids-logger "Install cudf_polars"
 python -m pip install $(echo ./dist/cudf_polars*.whl)
 
-# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
-TAG="py-1.7.0"
+TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
 rapids-logger "Clone polars to ${TAG}"
 git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1
 
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index a36e8734adc..05f882a475b 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -39,7 +39,7 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
       | tee ./constraints.txt
 fi
 
-# echo to expand wildcard before adding `[extra]` requires for pip
+# echo to expand wildcard before adding `[test]` requires for pip
 python -m pip install \
     -v \
     --constraint ./constraints.txt \
@@ -47,9 +47,6 @@ python -m pip install \
     "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
     "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
-rapids-logger "Pin to 1.7.0 Temporarily"
-python -m pip install polars==1.7.0
-
 rapids-logger "Run cudf_polars tests"
 
 function set_exitcode()
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index f91bf1e7046..8db03812a19 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -65,6 +65,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
+- polars>=1.8,<1.9
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<18.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index f4ec6bd5407..fdbe278b66b 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -63,6 +63,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
+- polars>=1.8,<1.9
 - pre-commit
 - pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 84b462bb884..136f43ee706 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -380,6 +380,7 @@ add_library(
   src/io/functions.cpp
   src/io/json/host_tree_algorithms.cu
   src/io/json/json_column.cu
+  src/io/json/column_tree_construction.cu
   src/io/json/json_normalization.cu
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index abc6f74fccf..4113e38dcf4 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -230,6 +230,11 @@ ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp)
 # --------------------------------------------------------------------------------
 ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp)
 
+# ##################################################################################################
+# * tdigest benchmark
+# --------------------------------------------------------------------------------
+ConfigureNVBench(TDIGEST_NVBENCH quantiles/tdigest.cu)
+
 # ##################################################################################################
 # * type_dispatcher benchmark ---------------------------------------------------------------------
 ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu)
diff --git a/cpp/benchmarks/quantiles/tdigest.cu b/cpp/benchmarks/quantiles/tdigest.cu
new file mode 100644
index 00000000000..9d37dbc9a26
--- /dev/null
+++ b/cpp/benchmarks/quantiles/tdigest.cu
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/detail/tdigest/tdigest.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/functional>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+
+#include <nvbench/nvbench.cuh>
+
+/**
+ * @brief Benchmarks `cudf::tdigest::detail::group_merge_tdigest` on synthetic tdigests.
+ *
+ * Builds `num_tdigests` tdigests of `tdigest_size` unit-weight centroids each, assigns every
+ * `tdigests_per_group` consecutive tdigests to one group, and times merging those groups.
+ *
+ * @param state nvbench state providing the `num_tdigests`, `tdigest_size`,
+ * `tdigests_per_group` and `max_centroids` axes
+ */
+void bm_tdigest_merge(nvbench::state& state)
+{
+  auto const tdigest_size = static_cast<cudf::size_type>(state.get_int64("tdigest_size"));
+  auto const tdigests_per_group =
+    static_cast<cudf::size_type>(state.get_int64("tdigests_per_group"));
+  auto const max_centroids = static_cast<cudf::size_type>(state.get_int64("max_centroids"));
+  // Benchmark whole groups only: round the requested tdigest count down to a multiple of
+  // tdigests_per_group. Otherwise the leftover tdigests would be labelled with group index
+  // num_groups, which is one past the last group described by group_offsets.
+  auto const num_groups =
+    static_cast<cudf::size_type>(state.get_int64("num_tdigests")) / tdigests_per_group;
+  auto const num_tdigests    = num_groups * tdigests_per_group;
+  auto const total_centroids = num_tdigests * tdigest_size;
+
+  auto stream = cudf::get_default_stream();
+  auto mr     = rmm::mr::get_current_device_resource();
+
+  constexpr int base_value = 5;
+
+  // construct inner means/weights
+  auto val_iter = cudf::detail::make_counting_transform_iterator(
+    0, cuda::proclaim_return_type<double>([tdigest_size](cudf::size_type i) {
+      return static_cast<double>(base_value + (i % tdigest_size));
+    }));
+  auto one_iter = thrust::make_constant_iterator(1);
+  cudf::test::fixed_width_column_wrapper<double> means(val_iter, val_iter + total_centroids);
+  cudf::test::fixed_width_column_wrapper<double> weights(one_iter, one_iter + total_centroids);
+  std::vector<std::unique_ptr<cudf::column>> inner_struct_children;
+  inner_struct_children.push_back(means.release());
+  inner_struct_children.push_back(weights.release());
+  cudf::test::structs_column_wrapper inner_struct(std::move(inner_struct_children));
+
+  // construct the tdigest lists themselves
+  auto offset_iter = cudf::detail::make_counting_transform_iterator(
+    0, cuda::proclaim_return_type<cudf::size_type>([tdigest_size](cudf::size_type i) {
+      return i * tdigest_size;
+    }));
+  cudf::test::fixed_width_column_wrapper<int> offsets(offset_iter, offset_iter + num_tdigests + 1);
+  auto list_col = cudf::make_lists_column(
+    num_tdigests, offsets.release(), inner_struct.release(), 0, {}, stream, mr);
+
+  // min and max columns
+  auto min_iter = thrust::make_constant_iterator(base_value);
+  auto max_iter = thrust::make_constant_iterator(base_value + (tdigest_size - 1));
+  cudf::test::fixed_width_column_wrapper<double> mins(min_iter, min_iter + num_tdigests);
+  cudf::test::fixed_width_column_wrapper<double> maxes(max_iter, max_iter + num_tdigests);
+
+  // assemble the whole thing
+  std::vector<std::unique_ptr<cudf::column>> tdigest_children;
+  tdigest_children.push_back(std::move(list_col));
+  tdigest_children.push_back(mins.release());
+  tdigest_children.push_back(maxes.release());
+  cudf::test::structs_column_wrapper tdigest(std::move(tdigest_children));
+
+  // group i owns tdigests [i * tdigests_per_group, (i + 1) * tdigests_per_group)
+  rmm::device_uvector<cudf::size_type> group_offsets(num_groups + 1, stream, mr);
+  rmm::device_uvector<cudf::size_type> group_labels(num_tdigests, stream, mr);
+  auto group_offset_iter = cudf::detail::make_counting_transform_iterator(
+    0,
+    cuda::proclaim_return_type<cudf::size_type>(
+      [tdigests_per_group] __device__(cudf::size_type i) { return i * tdigests_per_group; }));
+  thrust::copy(rmm::exec_policy_nosync(stream, mr),
+               group_offset_iter,
+               group_offset_iter + num_groups + 1,
+               group_offsets.begin());
+  auto group_label_iter = cudf::detail::make_counting_transform_iterator(
+    0,
+    cuda::proclaim_return_type<cudf::size_type>(
+      [tdigests_per_group] __device__(cudf::size_type i) { return i / tdigests_per_group; }));
+  thrust::copy(rmm::exec_policy_nosync(stream, mr),
+               group_label_iter,
+               group_label_iter + num_tdigests,
+               group_labels.begin());
+
+  state.add_element_count(total_centroids);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch, auto& timer) {
+               timer.start();
+               auto result = cudf::tdigest::detail::group_merge_tdigest(
+                 tdigest, group_offsets, group_labels, num_groups, max_centroids, stream, mr);
+               timer.stop();
+             });
+}
+
+NVBENCH_BENCH(bm_tdigest_merge)
+  .set_name("TDigest many tiny groups")
+  .add_int64_axis("num_tdigests", {500'000})
+  .add_int64_axis("tdigest_size", {1, 1000})
+  .add_int64_axis("tdigests_per_group", {1})
+  .add_int64_axis("max_centroids", {10000, 1000});
+
+NVBENCH_BENCH(bm_tdigest_merge)
+  .set_name("TDigest many small groups")
+  .add_int64_axis("num_tdigests", {500'000})
+  .add_int64_axis("tdigest_size", {1, 1000})
+  .add_int64_axis("tdigests_per_group", {3})
+  .add_int64_axis("max_centroids", {10000, 1000});
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index ff25a5bacae..6798557e14e 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -105,6 +105,8 @@ class json_reader_options {
   char _delimiter = '\n';
   // Prune columns on read, selected based on the _dtypes option
   bool _prune_columns = false;
+  // Experimental features: new column tree construction
+  bool _experimental = false;
 
   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
@@ -277,6 +279,15 @@ class json_reader_options {
    */
   [[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }
 
+  /**
+   * @brief Whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction
+   * and UTF-8 matching of field names, will be enabled.
+   * @return true if experimental features are enabled
+   */
+  [[nodiscard]] bool is_enabled_experimental() const { return _experimental; }
+
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
@@ -453,6 +464,16 @@ class json_reader_options {
    */
   void enable_prune_columns(bool val) { _prune_columns = val; }
 
+  /**
+   * @brief Set whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction
+   * and UTF-8 matching of field names, will be enabled.
+   *
+   * @param val Boolean value to enable/disable experimental features
+   */
+  void enable_experimental(bool val) { _experimental = val; }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
@@ -695,6 +716,21 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction
+   * and UTF-8 matching of field names, will be enabled.
+   *
+   * @param val Boolean value to enable/disable experimental features
+   * @return this for chaining
+   */
+  json_reader_options_builder& experimental(bool val)
+  {
+    options._experimental = val;
+    return *this;
+  }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu
new file mode 100644
index 00000000000..c4fe7926706
--- /dev/null
+++ b/cpp/src/io/json/column_tree_construction.cu
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nested_json.hpp"
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/functional>
+#include <thrust/for_each.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+#include <thrust/transform_scan.h>
+#include <thrust/unique.h>
+
+namespace cudf::io::json {
+
+using row_offset_t = size_type;
+
+#ifdef CSR_DEBUG_PRINT
+template <typename T>
+void print(device_span<T const> d_vec, std::string name, rmm::cuda_stream_view stream)
+{
+  stream.synchronize();
+  auto h_vec = cudf::detail::make_std_vector_sync(d_vec, stream);
+  std::cout << name << " = ";
+  for (auto e : h_vec) {
+    std::cout << e << " ";
+  }
+  std::cout << std::endl;
+}
+#endif
+
+namespace experimental::detail {
+
+struct level_ordering {  // less-than on node ids by (level, parent column id, column id)
+  device_span<TreeDepthT const> node_levels;      // indexed by node id
+  device_span<NodeIndexT const> col_ids;          // indexed by node id
+  device_span<NodeIndexT const> parent_node_ids;  // indexed by node id; may hold the sentinel
+  __device__ bool operator()(NodeIndexT lhs_node_id, NodeIndexT rhs_node_id) const
+  {
+    auto lhs_parent_col_id = parent_node_ids[lhs_node_id] == parent_node_sentinel
+                               ? parent_node_sentinel
+                               : col_ids[parent_node_ids[lhs_node_id]];
+    auto rhs_parent_col_id = parent_node_ids[rhs_node_id] == parent_node_sentinel
+                               ? parent_node_sentinel
+                               : col_ids[parent_node_ids[rhs_node_id]];
+
+    return (node_levels[lhs_node_id] < node_levels[rhs_node_id]) ||
+           (node_levels[lhs_node_id] == node_levels[rhs_node_id] &&
+            lhs_parent_col_id < rhs_parent_col_id) ||
+           (node_levels[lhs_node_id] == node_levels[rhs_node_id] &&
+            lhs_parent_col_id == rhs_parent_col_id && col_ids[lhs_node_id] < col_ids[rhs_node_id]);
+  }
+};
+
+struct parent_nodeids_to_colids {  // parent id -> rev_mapped_col_ids[id]; sentinel kept as is
+  device_span<NodeIndexT const> rev_mapped_col_ids;
+  __device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT
+  {
+    return parent_node_id == parent_node_sentinel ? parent_node_sentinel
+                                                  : rev_mapped_col_ids[parent_node_id];
+  }
+};
+
+/**
+ * @brief Reduces node tree representation to column tree CSR representation.
+ *
+ * @param node_tree Node tree representation of JSON string
+ * @param original_col_ids Column ids of nodes
+ * @param row_offsets Row offsets of nodes
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A tuple of column tree representation of JSON string, column ids of columns, and
+ * max row offsets of columns
+ */
+std::tuple<compressed_sparse_row, column_tree_properties> reduce_to_column_tree(
+  tree_meta_t& node_tree,
+  device_span<NodeIndexT const> original_col_ids,
+  device_span<NodeIndexT const> sorted_col_ids,
+  device_span<NodeIndexT const> ordered_node_ids,
+  device_span<row_offset_t const> row_offsets,
+  bool is_array_of_arrays,
+  NodeIndexT row_array_parent_col_id,
+  rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+
+  if (original_col_ids.empty()) {
+    rmm::device_uvector<NodeIndexT> empty_row_idx(0, stream);
+    rmm::device_uvector<NodeIndexT> empty_col_idx(0, stream);
+    rmm::device_uvector<NodeT> empty_column_categories(0, stream);
+    rmm::device_uvector<row_offset_t> empty_max_row_offsets(0, stream);
+    rmm::device_uvector<NodeIndexT> empty_mapped_col_ids(0, stream);
+    return std::tuple{compressed_sparse_row{std::move(empty_row_idx), std::move(empty_col_idx)},
+                      column_tree_properties{std::move(empty_column_categories),
+                                             std::move(empty_max_row_offsets),
+                                             std::move(empty_mapped_col_ids)}};
+  }
+
+  auto [unpermuted_tree, unpermuted_col_ids, unpermuted_max_row_offsets] =
+    cudf::io::json::detail::reduce_to_column_tree(node_tree,
+                                                  original_col_ids,
+                                                  sorted_col_ids,
+                                                  ordered_node_ids,
+                                                  row_offsets,
+                                                  is_array_of_arrays,
+                                                  row_array_parent_col_id,
+                                                  stream);
+
+  NodeIndexT num_columns = unpermuted_col_ids.size();
+
+  auto mapped_col_ids = cudf::detail::make_device_uvector_async(
+    unpermuted_col_ids, stream, cudf::get_current_device_resource_ref());
+  rmm::device_uvector<NodeIndexT> rev_mapped_col_ids(num_columns, stream);
+  rmm::device_uvector<NodeIndexT> reordering_index(unpermuted_col_ids.size(), stream);
+
+  thrust::sequence(
+    rmm::exec_policy_nosync(stream), reordering_index.begin(), reordering_index.end());
+  // Reorder nodes and column ids in level-wise fashion
+  thrust::sort_by_key(
+    rmm::exec_policy_nosync(stream),
+    reordering_index.begin(),
+    reordering_index.end(),
+    mapped_col_ids.begin(),
+    level_ordering{
+      unpermuted_tree.node_levels, unpermuted_col_ids, unpermuted_tree.parent_node_ids});
+
+  {
+    auto mapped_col_ids_copy = cudf::detail::make_device_uvector_async(
+      mapped_col_ids, stream, cudf::get_current_device_resource_ref());
+    thrust::sequence(
+      rmm::exec_policy_nosync(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end());
+    thrust::sort_by_key(rmm::exec_policy_nosync(stream),
+                        mapped_col_ids_copy.begin(),
+                        mapped_col_ids_copy.end(),
+                        rev_mapped_col_ids.begin());
+  }
+
+  rmm::device_uvector<NodeIndexT> parent_col_ids(num_columns, stream);
+  thrust::transform_output_iterator parent_col_ids_it(parent_col_ids.begin(),
+                                                      parent_nodeids_to_colids{rev_mapped_col_ids});
+  rmm::device_uvector<row_offset_t> max_row_offsets(num_columns, stream);
+  rmm::device_uvector<NodeT> column_categories(num_columns, stream);
+  thrust::copy_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_zip_iterator(thrust::make_permutation_iterator(
+                                unpermuted_tree.parent_node_ids.begin(), reordering_index.begin()),
+                              thrust::make_permutation_iterator(unpermuted_max_row_offsets.begin(),
+                                                                reordering_index.begin()),
+                              thrust::make_permutation_iterator(
+                                unpermuted_tree.node_categories.begin(), reordering_index.begin())),
+    num_columns,
+    thrust::make_zip_iterator(
+      parent_col_ids_it, max_row_offsets.begin(), column_categories.begin()));
+
+#ifdef CSR_DEBUG_PRINT
+  print<NodeIndexT>(reordering_index, "h_reordering_index", stream);
+  print<NodeIndexT>(mapped_col_ids, "h_mapped_col_ids", stream);
+  print<NodeIndexT>(rev_mapped_col_ids, "h_rev_mapped_col_ids", stream);
+  print<NodeIndexT>(parent_col_ids, "h_parent_col_ids", stream);
+  print<row_offset_t>(max_row_offsets, "h_max_row_offsets", stream);
+#endif
+  // Returns the CSR row offsets (num_columns + 1 entries) of the column tree's adjacency lists.
+  auto construct_row_idx = [&stream](NodeIndexT num_columns,
+                                     device_span<NodeIndexT const> parent_col_ids) {
+    auto row_idx = cudf::detail::make_zeroed_device_uvector_async<NodeIndexT>(
+      static_cast<std::size_t>(num_columns + 1), stream, cudf::get_current_device_resource_ref());
+    // Note that the first element of parent_col_ids is -1 (parent_node_sentinel), so every pass
+    // below starts at parent_col_ids.begin() + 1. Each run of equal parent ids is reduced to a
+    // (parent column id, number of children) pair.
+    auto num_non_leaf_columns = thrust::unique_count(
+      rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end());
+    rmm::device_uvector<NodeIndexT> non_leaf_nodes(num_non_leaf_columns, stream);
+    rmm::device_uvector<NodeIndexT> non_leaf_nodes_children(num_non_leaf_columns, stream);
+    thrust::reduce_by_key(rmm::exec_policy_nosync(stream),
+                          parent_col_ids.begin() + 1,
+                          parent_col_ids.end(),
+                          thrust::make_constant_iterator(1),
+                          non_leaf_nodes.begin(),
+                          non_leaf_nodes_children.begin(),
+                          thrust::equal_to<NodeIndexT>());
+    // Keys are compared as NodeIndexT; each child count is then stored at row_idx[parent + 1].
+    thrust::scatter(rmm::exec_policy_nosync(stream),
+                    non_leaf_nodes_children.begin(),
+                    non_leaf_nodes_children.end(),
+                    non_leaf_nodes.begin(),
+                    row_idx.begin() + 1);
+    // Prefix-sum the child counts; every column except the root (n == 1) adds 1 for its parent.
+    if (num_columns > 1) {
+      thrust::transform_inclusive_scan(
+        rmm::exec_policy_nosync(stream),
+        thrust::make_zip_iterator(thrust::make_counting_iterator(1), row_idx.begin() + 1),
+        thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, row_idx.end()),
+        row_idx.begin() + 1,
+        cuda::proclaim_return_type<NodeIndexT>([] __device__(auto a) {
+          auto n   = thrust::get<0>(a);
+          auto idx = thrust::get<1>(a);
+          return n == 1 ? idx : idx + 1;
+        }),
+        thrust::plus<NodeIndexT>{});
+    } else {
+      auto single_node = 1;  // local host value: copy synchronously so it outlives the transfer
+      row_idx.set_element(1, single_node, stream);
+    }
+
+#ifdef CSR_DEBUG_PRINT
+    print<NodeIndexT>(row_idx, "h_row_idx", stream);
+#endif
+    return row_idx;
+  };
+  // Returns the CSR column indices: each non-root row holds its parent first, then its children.
+  auto construct_col_idx = [&stream](NodeIndexT num_columns,
+                                     device_span<NodeIndexT const> parent_col_ids,
+                                     device_span<NodeIndexT const> row_idx) {
+    rmm::device_uvector<NodeIndexT> col_idx((num_columns - 1) * 2, stream);  // 2 per non-root
+    thrust::fill(rmm::exec_policy_nosync(stream), col_idx.begin(), col_idx.end(), -1);
+    // Scatter map over the non-root columns: starts as the 1-based rank within a run of siblings
+    rmm::device_uvector<NodeIndexT> map(num_columns - 1, stream);
+    thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream),
+                                  parent_col_ids.begin() + 1,
+                                  parent_col_ids.end(),
+                                  thrust::make_constant_iterator(1),
+                                  map.begin());
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::make_counting_iterator(1),
+                       num_columns - 1,
+                       [row_idx        = row_idx.begin(),
+                        map            = map.begin(),
+                        parent_col_ids = parent_col_ids.begin()] __device__(auto i) {
+                         auto parent_col_id = parent_col_ids[i];
+                         if (parent_col_id == 0)
+                           --map[i - 1];  // root row has no parent slot: rank becomes 0-based
+                         else
+                           map[i - 1] += row_idx[parent_col_id];  // offset past the parent slot
+                       });
+    thrust::scatter(rmm::exec_policy_nosync(stream),
+                    thrust::make_counting_iterator(1),
+                    thrust::make_counting_iterator(1) + num_columns - 1,
+                    map.begin(),
+                    col_idx.begin());  // write each child id into its parent's row
+    // The first slot of every non-root row receives that column's parent id.
+    // Skip the parent of root node
+    thrust::scatter(rmm::exec_policy_nosync(stream),
+                    parent_col_ids.begin() + 1,
+                    parent_col_ids.end(),
+                    row_idx.begin() + 1,
+                    col_idx.begin());
+
+#ifdef CSR_DEBUG_PRINT
+    print<NodeIndexT>(col_idx, "h_col_idx", stream);
+#endif
+
+    return col_idx;
+  };
+
+  /*
+    5. CSR construction:
+      a. Sort column levels and get their ordering
+      b. For each column node coln iterated according to sorted_column_levels; do
+          i. Find nodes that have coln as the parent node -> set adj_coln
+          ii. row idx[coln] = size of adj_coln + 1
+          iii. col idx[coln] = adj_coln U {parent_col_id[coln]}
+  */
+  auto row_idx = construct_row_idx(num_columns, parent_col_ids);
+  auto col_idx = construct_col_idx(num_columns, parent_col_ids, row_idx);
+
+  return std::tuple{
+    compressed_sparse_row{std::move(row_idx), std::move(col_idx)},
+    column_tree_properties{
+      std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}};
+}
+
+}  // namespace experimental::detail
+}  // namespace cudf::io::json
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index 70d61132b42..5855f1b5a5f 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -43,6 +44,7 @@
 #include <thrust/uninitialized_fill.h>
 
 #include <algorithm>
+#include <deque>
 
 namespace cudf::io::json::detail {
 
@@ -58,16 +60,15 @@ namespace cudf::io::json::detail {
  */
 rmm::device_uvector<NodeIndexT> get_values_column_indices(TreeDepthT const row_array_children_level,
                                                           tree_meta_t const& d_tree,
-                                                          device_span<NodeIndexT> col_ids,
+                                                          device_span<NodeIndexT const> col_ids,
                                                           size_type const num_columns,
                                                           rmm::cuda_stream_view stream)
 {
-  CUDF_FUNC_RANGE();
   auto [level2_nodes, level2_indices] = get_array_children_indices(
     row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream);
   auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin());
   rmm::device_uvector<NodeIndexT> values_column_indices(num_columns, stream);
-  thrust::scatter(rmm::exec_policy(stream),
+  thrust::scatter(rmm::exec_policy_nosync(stream),
                   level2_indices.begin(),
                   level2_indices.end(),
                   col_id_location,
@@ -90,12 +91,11 @@ std::vector<std::string> copy_strings_to_host_sync(
   device_span<SymbolOffsetT const> node_range_end,
   rmm::cuda_stream_view stream)
 {
-  CUDF_FUNC_RANGE();
   auto const num_strings = node_range_begin.size();
   rmm::device_uvector<size_type> string_offsets(num_strings, stream);
   rmm::device_uvector<size_type> string_lengths(num_strings, stream);
   auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
-  thrust::transform(rmm::exec_policy(stream),
+  thrust::transform(rmm::exec_policy_nosync(stream),
                     d_offset_pairs,
                     d_offset_pairs + num_strings,
                     thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()),
@@ -161,18 +161,18 @@ std::vector<std::string> copy_strings_to_host_sync(
 rmm::device_uvector<uint8_t> is_all_nulls_each_column(device_span<SymbolT const> input,
                                                       tree_meta_t const& d_column_tree,
                                                       tree_meta_t const& tree,
-                                                      device_span<NodeIndexT> col_ids,
+                                                      device_span<NodeIndexT const> col_ids,
                                                       cudf::io::json_reader_options const& options,
                                                       rmm::cuda_stream_view stream)
 {
   auto const num_nodes = col_ids.size();
   auto const num_cols  = d_column_tree.node_categories.size();
   rmm::device_uvector<uint8_t> is_all_nulls(num_cols, stream);
-  thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true);
+  thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true);
 
   auto parse_opt = parsing_options(options, stream);
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::counting_iterator<size_type>(0),
     num_nodes,
     [options           = parse_opt.view(),
@@ -193,7 +193,7 @@ rmm::device_uvector<uint8_t> is_all_nulls_each_column(device_span<SymbolT const>
   return is_all_nulls;
 }
 
-NodeIndexT get_row_array_parent_col_id(device_span<NodeIndexT> col_ids,
+NodeIndexT get_row_array_parent_col_id(device_span<NodeIndexT const> col_ids,
                                        bool is_enabled_lines,
                                        rmm::cuda_stream_view stream)
 {
@@ -221,33 +221,34 @@ struct json_column_data {
   bitmask_type* validity;
 };
 
-std::pair<cudf::detail::host_vector<uint8_t>,
-          std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>>
-build_tree(device_json_column& root,
-           std::vector<uint8_t> const& is_str_column_all_nulls,
-           tree_meta_t& d_column_tree,
-           device_span<NodeIndexT const> d_unique_col_ids,
-           device_span<size_type const> d_max_row_offsets,
-           std::vector<std::string> const& column_names,
-           NodeIndexT row_array_parent_col_id,
-           bool is_array_of_arrays,
-           cudf::io::json_reader_options const& options,
-           rmm::cuda_stream_view stream,
-           rmm::device_async_resource_ref mr);
-void scatter_offsets(
-  tree_meta_t& tree,
-  device_span<NodeIndexT> col_ids,
-  device_span<size_type> row_offsets,
-  device_span<size_type> node_ids,
-  device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+using hashmap_of_device_columns =
+  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>;
+
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
   tree_meta_t& d_column_tree,
-  host_span<const uint8_t> ignore_vals,
-  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>& columns,
-  rmm::cuda_stream_view stream);
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+void scatter_offsets(tree_meta_t const& tree,
+                     device_span<NodeIndexT const> col_ids,
+                     device_span<size_type const> row_offsets,
+                     device_span<size_type> node_ids,
+                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+                     tree_meta_t const& d_column_tree,
+                     host_span<const bool> ignore_vals,
+                     hashmap_of_device_columns const& columns,
+                     rmm::cuda_stream_view stream);
 
 /**
  * @brief Constructs `d_json_column` from node tree representation
- * Newly constructed columns are insert into `root`'s children.
+ * Newly constructed columns are inserted into `root`'s children.
  * `root` must be a list type.
  *
  * @param input Input JSON string device data
@@ -265,28 +266,28 @@ void scatter_offsets(
  * of child_offets and validity members of `d_json_column`
  */
 void make_device_json_column(device_span<SymbolT const> input,
-                             tree_meta_t& tree,
-                             device_span<NodeIndexT> col_ids,
-                             device_span<size_type> row_offsets,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
                              device_json_column& root,
                              bool is_array_of_arrays,
                              cudf::io::json_reader_options const& options,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
-  CUDF_FUNC_RANGE();
-
   bool const is_enabled_lines                 = options.is_enabled_lines();
   bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
-  auto const num_nodes                        = col_ids.size();
-  rmm::device_uvector<NodeIndexT> sorted_col_ids(col_ids.size(), stream);  // make a copy
-  thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin());
+  // make a copy
+  auto sorted_col_ids = cudf::detail::make_device_uvector_async(
+    col_ids, stream, cudf::get_current_device_resource_ref());
 
   // sort by {col_id} on {node_ids} stable
   rmm::device_uvector<NodeIndexT> node_ids(col_ids.size(), stream);
-  thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end());
-  thrust::stable_sort_by_key(
-    rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin());
+  thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end());
+  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
+                             sorted_col_ids.begin(),
+                             sorted_col_ids.end(),
+                             node_ids.begin());
 
   NodeIndexT const row_array_parent_col_id =
     get_row_array_parent_col_id(col_ids, is_enabled_lines, stream);
@@ -316,7 +317,7 @@ void make_device_json_column(device_span<SymbolT const> input,
       cudf::detail::make_host_vector_sync(values_column_indices, stream);
     std::transform(unique_col_ids.begin(),
                    unique_col_ids.end(),
-                   column_names.begin(),
+                   column_names.cbegin(),
                    column_names.begin(),
                    [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id](
                      auto col_id, auto name) mutable {
@@ -333,17 +334,17 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
     return std::vector<uint8_t>();
   }();
-  auto [ignore_vals, columns] = build_tree(root,
-                                           is_str_column_all_nulls,
-                                           d_column_tree,
-                                           d_unique_col_ids,
-                                           d_max_row_offsets,
-                                           column_names,
-                                           row_array_parent_col_id,
-                                           is_array_of_arrays,
-                                           options,
-                                           stream,
-                                           mr);
+  auto const [ignore_vals, columns] = build_tree(root,
+                                                 is_str_column_all_nulls,
+                                                 d_column_tree,
+                                                 d_unique_col_ids,
+                                                 d_max_row_offsets,
+                                                 column_names,
+                                                 row_array_parent_col_id,
+                                                 is_array_of_arrays,
+                                                 options,
+                                                 stream,
+                                                 mr);
 
   scatter_offsets(tree,
                   col_ids,
@@ -356,19 +357,18 @@ void make_device_json_column(device_span<SymbolT const> input,
                   stream);
 }
 
-std::pair<cudf::detail::host_vector<uint8_t>,
-          std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>>
-build_tree(device_json_column& root,
-           std::vector<uint8_t> const& is_str_column_all_nulls,
-           tree_meta_t& d_column_tree,
-           device_span<NodeIndexT const> d_unique_col_ids,
-           device_span<size_type const> d_max_row_offsets,
-           std::vector<std::string> const& column_names,
-           NodeIndexT row_array_parent_col_id,
-           bool is_array_of_arrays,
-           cudf::io::json_reader_options const& options,
-           rmm::cuda_stream_view stream,
-           rmm::device_async_resource_ref mr)
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
+  tree_meta_t& d_column_tree,
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
 {
   bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
   auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
@@ -380,6 +380,7 @@ build_tree(device_json_column& root,
     cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
   auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
   auto num_columns           = d_unique_col_ids.size();
+  stream.synchronize();
 
   auto to_json_col_type = [](auto category) {
     switch (category) {
@@ -439,11 +440,12 @@ build_tree(device_json_column& root,
   });
 
   // use hash map because we may skip field name's col_ids
-  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>> columns;
+  hashmap_of_device_columns columns;
   // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking
   std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
   // find column_ids which are values, but should be ignored in validity
-  auto ignore_vals = cudf::detail::make_host_vector<uint8_t>(num_columns, stream);
+  auto ignore_vals = cudf::detail::make_host_vector<bool>(num_columns, stream);
+  std::fill(ignore_vals.begin(), ignore_vals.end(), false);
   std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
   std::vector<uint8_t> is_pruned(num_columns, 0);
   // for columns that are not mixed type but have been forced as string
@@ -452,7 +454,7 @@ build_tree(device_json_column& root,
 
   std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
     [&](NodeIndexT this_col_id, device_json_column& col) {
-      for (auto col_name : col.column_order) {
+      for (auto const& col_name : col.column_order) {
         auto child_id                  = mapped_columns[{this_col_id, col_name}];
         is_mixed_type_column[child_id] = 1;
         remove_child_columns(child_id, col.child_columns.at(col_name));
@@ -523,7 +525,7 @@ build_tree(device_json_column& root,
     if (parent_col_id != parent_node_sentinel &&
           (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) ||
         forced_as_string_column[parent_col_id]) {
-      ignore_vals[this_col_id] = 1;
+      ignore_vals[this_col_id] = true;
       if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
       if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; }
       continue;
@@ -569,12 +571,12 @@ build_tree(device_json_column& root,
       }
 
       if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
-        ignore_vals[this_col_id] = 1;
+        ignore_vals[this_col_id] = true;
         continue;
       }
       if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) {
         // remap
-        ignore_vals[old_col_id] = 1;
+        ignore_vals[old_col_id] = true;
         mapped_columns.erase({parent_col_id, name});
         columns.erase(old_col_id);
         parent_col.child_columns.erase(name);
@@ -624,7 +626,7 @@ build_tree(device_json_column& root,
       auto parent_col_id = column_parent_ids[this_col_id];
       if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) {
         is_mixed_type_column[this_col_id] = 1;
-        ignore_vals[this_col_id]          = 1;
+        ignore_vals[this_col_id]          = true;
         columns.erase(this_col_id);
       }
       // Convert only mixed type columns as string (so to copy), but not its children
@@ -644,7 +646,7 @@ build_tree(device_json_column& root,
     auto parent_col_id = column_parent_ids[this_col_id];
     if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) {
       forced_as_string_column[this_col_id] = true;
-      ignore_vals[this_col_id]             = 1;
+      ignore_vals[this_col_id]             = true;
     }
     // Convert only mixed type columns as string (so to copy), but not its children
     if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and
@@ -664,16 +666,15 @@ build_tree(device_json_column& root,
   return {ignore_vals, columns};
 }
 
-void scatter_offsets(
-  tree_meta_t& tree,
-  device_span<NodeIndexT> col_ids,
-  device_span<size_type> row_offsets,
-  device_span<size_type> node_ids,
-  device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
-  tree_meta_t& d_column_tree,
-  host_span<const uint8_t> ignore_vals,
-  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>& columns,
-  rmm::cuda_stream_view stream)
+void scatter_offsets(tree_meta_t const& tree,
+                     device_span<NodeIndexT const> col_ids,
+                     device_span<size_type const> row_offsets,
+                     device_span<size_type> node_ids,
+                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+                     tree_meta_t const& d_column_tree,
+                     host_span<const bool> ignore_vals,
+                     hashmap_of_device_columns const& columns,
+                     rmm::cuda_stream_view stream)
 {
   auto const num_nodes   = col_ids.size();
   auto const num_columns = d_column_tree.node_categories.size();
@@ -695,7 +696,7 @@ void scatter_offsets(
 
   // 3. scatter string offsets to respective columns, set validity bits
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::counting_iterator<size_type>(0),
     num_nodes,
     [column_categories = d_column_tree.node_categories.begin(),
@@ -739,7 +740,7 @@ void scatter_offsets(
                                                                   : col_ids[parent_node_ids[node_id]];
       }));
   auto const list_children_end = thrust::copy_if(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id),
     thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id) +
       num_nodes,
@@ -757,12 +758,12 @@ void scatter_offsets(
 
   auto const num_list_children =
     list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
-  thrust::stable_sort_by_key(rmm::exec_policy(stream),
+  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
                              parent_col_ids.begin(),
                              parent_col_ids.begin() + num_list_children,
                              node_ids.begin());
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_counting_iterator<size_type>(0),
     num_list_children,
     [node_ids        = node_ids.begin(),
@@ -805,4 +806,599 @@ void scatter_offsets(
   stream.synchronize();
 }
 
+namespace experimental {
+
+std::map<std::string, schema_element> unified_schema(cudf::io::json_reader_options const& options)
+{
+  // Normalizes whichever dtype form the user supplied into a name -> schema_element map.
+  using schema_map = std::map<std::string, schema_element>;
+  return std::visit(
+    cudf::detail::visitor_overload{
+      // Positional dtypes: the key is the column index written as a decimal string.
+      [](std::vector<data_type> const& positional_types) {
+        schema_map unified;
+        for (size_t idx = 0; idx < positional_types.size(); ++idx) {
+          unified.emplace_hint(
+            unified.end(), std::to_string(idx), schema_element{positional_types[idx]});
+        }
+        return unified;
+      },
+      // Named flat dtypes: each data_type is wrapped in a schema_element built from the type alone.
+      [](std::map<std::string, data_type> const& named_types) {
+        schema_map unified;
+        for (auto const& [col_name, col_type] : named_types) {
+          unified.emplace_hint(unified.end(), col_name, schema_element{col_type});
+        }
+        return unified;
+      },
+      // Already a name -> schema_element map: hand back a copy unchanged.
+      [](std::map<std::string, schema_element> const& full_schema) { return full_schema; }},
+    options.get_dtypes());
+}
+
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
+  tree_meta_t& d_column_tree,
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+/**
+ * @brief Constructs `d_json_column` from node tree representation.
+ * Newly constructed columns are inserted into `root`'s children.
+ * `root` must be a list type.
+ *
+ * @param input Input JSON string device data
+ * @param tree Node tree representation of the JSON string
+ * @param col_ids Column ids of the nodes in the tree
+ * @param row_offsets Row offsets of the nodes in the tree
+ * @param root Root node of the `d_json_column` tree
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param options Parsing options specifying the parsing behaviour.
+ * Options read directly here (the whole object is also forwarded to `build_tree`):
+ *   is_enabled_lines: Whether the input is a line-delimited JSON
+ *   is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the device memory
+ * of child_offsets and validity members of `d_json_column`
+ */
+void make_device_json_column(device_span<SymbolT const> input,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
+                             device_json_column& root,
+                             bool is_array_of_arrays,
+                             cudf::io::json_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr)
+{
+  bool const is_enabled_lines                 = options.is_enabled_lines();
+  bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
+  // mutable copy of col_ids: it is sorted in place below and later handed to scatter_offsets
+  auto sorted_col_ids = cudf::detail::make_device_uvector_async(
+    col_ids, stream, cudf::get_current_device_resource_ref());
+
+  // stable sort of the node ids (0..N-1) keyed by col_id, so nodes of one column are contiguous
+  rmm::device_uvector<NodeIndexT> node_ids(col_ids.size(), stream);
+  thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end());
+  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
+                             sorted_col_ids.begin(),
+                             sorted_col_ids.end(),
+                             node_ids.begin());
+
+  NodeIndexT const row_array_parent_col_id =
+    get_row_array_parent_col_id(col_ids, is_enabled_lines, stream);
+
+  // 1. gather column information: reduced column tree, unique col ids and max row offsets.
+  auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] =
+    reduce_to_column_tree(tree,
+                          col_ids,
+                          sorted_col_ids,
+                          node_ids,
+                          row_offsets,
+                          is_array_of_arrays,
+                          row_array_parent_col_id,
+                          stream);
+
+  auto num_columns                      = d_unique_col_ids.size();
+  std::vector<std::string> column_names = copy_strings_to_host_sync(
+    input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
+  // array of arrays: direct children of the row array are renamed to their value-column index
+  if (is_array_of_arrays) {
+    auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
+    auto const column_parent_ids =
+      cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
+    TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
+    auto values_column_indices =
+      get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
+    auto h_values_column_indices =
+      cudf::detail::make_host_vector_sync(values_column_indices, stream);
+    std::transform(unique_col_ids.begin(),
+                   unique_col_ids.end(),
+                   column_names.cbegin(),
+                   column_names.begin(),
+                   [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id](
+                     auto col_id, auto name) mutable {
+                     return column_parent_ids[col_id] == row_array_parent_col_id
+                              ? std::to_string(h_values_column_indices[col_id])
+                              : name;
+                   });
+  }
+
+  auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() {
+    if (is_enabled_mixed_types_as_string) {
+      return cudf::detail::make_std_vector_sync(
+        is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream);
+    }
+    return std::vector<uint8_t>();  // left empty when mixed types are not read as strings
+  }();  // d_column_tree is a structured binding, hence the explicit init-capture above
+  auto const [ignore_vals, columns] = build_tree(root,
+                                                 is_str_column_all_nulls,
+                                                 d_column_tree,
+                                                 d_unique_col_ids,
+                                                 d_max_row_offsets,
+                                                 column_names,
+                                                 row_array_parent_col_id,
+                                                 is_array_of_arrays,
+                                                 options,
+                                                 stream,
+                                                 mr);
+  if (ignore_vals.empty()) return;  // build_tree found no columns, so nothing to scatter
+  scatter_offsets(tree,
+                  col_ids,
+                  row_offsets,
+                  node_ids,
+                  sorted_col_ids,
+                  d_column_tree,
+                  ignore_vals,
+                  columns,
+                  stream);
+}
+
+// Builds the `device_json_column` tree under `root`: applies user dtypes, pruning and mixed-type
+// rules. Returns {is_pruned flag per column, col_id -> column reference}; both empty if no columns.
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
+  tree_meta_t& d_column_tree,
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  bool const is_enabled_lines                 = options.is_enabled_lines();
+  bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
+  auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
+  auto column_categories =
+    cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream);
+  auto const column_parent_ids =
+    cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
+  auto column_range_beg =
+    cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
+  auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
+  auto num_columns           = d_unique_col_ids.size();
+  stream.synchronize();  // the async host copies above must complete before they are read
+
+  auto to_json_col_type = [](auto category) {
+    switch (category) {
+      case NC_STRUCT: return json_col_t::StructColumn;
+      case NC_LIST: return json_col_t::ListColumn;
+      case NC_STR: [[fallthrough]];
+      case NC_VAL: return json_col_t::StringColumn;
+      default: return json_col_t::Unknown;
+    }
+  };
+
+  auto initialize_json_columns = [&](auto i, auto& col_ref, auto column_category) {
+    auto& col = col_ref.get();
+    if (col.type != json_col_t::Unknown) { return; }  // already initialized
+    if (column_category == NC_ERR || column_category == NC_FN) {
+      return;
+    } else if (column_category == NC_VAL || column_category == NC_STR) {
+      col.string_offsets.resize(max_row_offsets[i] + 1, stream);
+      col.string_lengths.resize(max_row_offsets[i] + 1, stream);
+      thrust::fill(
+        rmm::exec_policy_nosync(stream),
+        thrust::make_zip_iterator(col.string_offsets.begin(), col.string_lengths.begin()),
+        thrust::make_zip_iterator(col.string_offsets.end(), col.string_lengths.end()),
+        thrust::make_tuple(0, 0));
+    } else if (column_category == NC_LIST) {
+      col.child_offsets.resize(max_row_offsets[i] + 2, stream);  // num_rows + 1 offsets
+      thrust::uninitialized_fill(
+        rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), 0);
+    }
+    col.num_rows = max_row_offsets[i] + 1;
+    col.validity =
+      cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
+    col.type = to_json_col_type(column_category);
+  };
+
+  // 2. generate nested columns tree and its device_memory
+  // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
+  auto h_range_col_id_it =
+    thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin());
+  std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
+    return thrust::get<0>(a) < thrust::get<0>(b);
+  });
+  // adjacency list construction: parent col_id -> child col_ids, children kept in sorted order
+  std::map<NodeIndexT, std::vector<NodeIndexT>> adj;
+  for (auto const this_col_id : unique_col_ids) {
+    auto parent_col_id = column_parent_ids[this_col_id];
+    adj[parent_col_id].push_back(this_col_id);
+  }
+
+  // Pruning: every column starts as pruned only when the prune-columns option is enabled
+  auto is_pruned = cudf::detail::make_host_vector<bool>(num_columns, stream);
+  std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns());
+
+  // prune all children of a column, but not self (breadth-first walk over adj).
+  auto ignore_all_children = [&](auto parent_col_id) {
+    std::deque<NodeIndexT> offspring;
+    if (adj.count(parent_col_id)) {
+      for (auto const& child : adj[parent_col_id]) {
+        offspring.push_back(child);
+      }
+    }
+    while (!offspring.empty()) {
+      auto this_id = offspring.front();
+      offspring.pop_front();
+      is_pruned[this_id] = true;
+      if (adj.count(this_id)) {
+        for (auto const& child : adj[this_id]) {
+          offspring.push_back(child);
+        }
+      }
+    }
+  };
+
+  // Pruning: iterate through schema and mark only those columns and enforce type.
+  // NoPruning: iterate through schema and enforce type.
+
+  if (adj[parent_node_sentinel].empty())
+    return {cudf::detail::make_host_vector<bool>(0, stream), {}};  // for empty file
+  CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1");
+  auto expected_types = cudf::detail::make_host_vector<NodeT>(num_columns, stream);
+  std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES);  // sentinel: nothing forced
+
+  auto lookup_names = [&column_names](auto child_ids, auto name) {
+    for (auto const& child_id : child_ids) {
+      if (column_names[child_id] == name) return child_id;
+    }
+    return -1;
+  };
+  // recursive lambda on schema to mark columns as pruned.
+  std::function<void(NodeIndexT root, schema_element const& schema)> mark_is_pruned;
+  mark_is_pruned = [&is_pruned,
+                    &mark_is_pruned,
+                    &adj,
+                    &lookup_names,
+                    &column_categories,
+                    &expected_types,
+                    &ignore_all_children](NodeIndexT root, schema_element const& schema) -> void {
+    if (root == -1) return;
+    bool pass =
+      (schema.type == data_type{type_id::STRUCT} and column_categories[root] == NC_STRUCT) or
+      (schema.type == data_type{type_id::LIST} and column_categories[root] == NC_LIST) or
+      (schema.type != data_type{type_id::STRUCT} and schema.type != data_type{type_id::LIST} and
+       column_categories[root] != NC_FN);
+    if (!pass) {
+      // ignore all children of this column and prune this column.
+      is_pruned[root] = true;
+      ignore_all_children(root);
+      return;
+    }
+    is_pruned[root]    = false;
+    auto expected_type = [](auto type, auto cat) {
+      if (type == data_type{type_id::STRUCT} and cat == NC_STRUCT) return NC_STRUCT;
+      if (type == data_type{type_id::LIST} and cat == NC_LIST) return NC_LIST;
+      if (type != data_type{type_id::STRUCT} and type != data_type{type_id::LIST}) return NC_STR;
+      return NC_ERR;
+    }(schema.type, column_categories[root]);
+    expected_types[root] = expected_type;  // forced type.
+    // ignore children of nested columns, but not self.
+    if (expected_type == NC_STR and
+        (column_categories[root] == NC_STRUCT or column_categories[root] == NC_LIST))
+      ignore_all_children(root);
+    if (not(schema.type == data_type{type_id::STRUCT} or schema.type == data_type{type_id::LIST}))
+      return;  // no children to mark for non-nested.
+    auto child_ids = adj.count(root) ? adj[root] : std::vector<NodeIndexT>{};
+    if (schema.type == data_type{type_id::STRUCT}) {
+      for (auto const& key_pair : schema.child_types) {
+        auto col_id = lookup_names(child_ids, key_pair.first);
+        if (col_id == -1) continue;
+        is_pruned[col_id] = false;
+        for (auto const& child_id : adj[col_id])  // children of field (>1 if mixed)
+          mark_is_pruned(child_id, key_pair.second);
+      }
+    } else if (schema.type == data_type{type_id::LIST}) {
+      // partial solution for list children to have any name.
+      auto this_list_child_name =
+        schema.child_types.size() == 1 ? schema.child_types.begin()->first : list_child_name;
+      if (schema.child_types.count(this_list_child_name) == 0) return;
+      auto list_child = schema.child_types.at(this_list_child_name);
+      for (auto const& child_id : child_ids)
+        mark_is_pruned(child_id, list_child);
+    }
+  };
+  if (is_array_of_arrays) {
+    if (adj[adj[parent_node_sentinel][0]].empty())
+      return {cudf::detail::make_host_vector<bool>(0, stream), {}};
+    auto root_list_col_id =
+      is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0];
+    // mark root and row array col_id as not pruned.
+    if (!is_enabled_lines) {
+      auto top_level_list_id       = adj[parent_node_sentinel][0];
+      is_pruned[top_level_list_id] = false;
+    }
+    is_pruned[root_list_col_id] = false;
+    std::visit(cudf::detail::visitor_overload{
+                 [&root_list_col_id, &adj, &mark_is_pruned, &column_names](
+                   std::vector<data_type> const& user_dtypes) -> void {
+                   for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size();
+                        i++) {
+                     NodeIndexT const first_child_id = adj[root_list_col_id][i];
+                     auto name                       = column_names[first_child_id];
+                     auto value_id                   = std::stol(name);
+                     if (value_id >= 0 and value_id < static_cast<long>(user_dtypes.size()))
+                       mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]});
+                     // Note: mixed type - forced type, will work here.
+                   }
+                 },
+                 [&root_list_col_id, &adj, &mark_is_pruned, &column_names](
+                   std::map<std::string, data_type> const& user_dtypes) -> void {
+                   for (size_t i = 0; i < adj[root_list_col_id].size(); i++) {
+                     auto const first_child_id = adj[root_list_col_id][i];
+                     auto name                 = column_names[first_child_id];
+                     if (user_dtypes.count(name))
+                       mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)});
+                   }
+                 },
+                 [&root_list_col_id, &adj, &mark_is_pruned, &column_names](
+                   std::map<std::string, schema_element> const& user_dtypes) -> void {
+                   for (size_t i = 0; i < adj[root_list_col_id].size(); i++) {
+                     auto const first_child_id = adj[root_list_col_id][i];
+                     auto name                 = column_names[first_child_id];
+                     if (user_dtypes.count(name))
+                       mark_is_pruned(first_child_id, user_dtypes.at(name));
+                   }
+                 }},
+               options.get_dtypes());
+  } else {
+    auto root_struct_col_id =
+      is_enabled_lines
+        ? adj[parent_node_sentinel][0]
+        : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]);
+    // mark root and row struct col_id as not pruned (row struct id is -1 for an empty list).
+    if (!is_enabled_lines) {
+      auto top_level_list_id       = adj[parent_node_sentinel][0];
+      is_pruned[top_level_list_id] = false;
+    }
+    if (root_struct_col_id != -1) { is_pruned[root_struct_col_id] = false; }
+    schema_element u_schema{data_type{type_id::STRUCT}};
+    u_schema.child_types = unified_schema(options);
+    std::visit(
+      cudf::detail::visitor_overload{
+        [&is_pruned, &root_struct_col_id, &adj, &mark_is_pruned](
+          std::vector<data_type> const& user_dtypes) -> void {
+          for (size_t i = 0; i < adj[root_struct_col_id].size() && i < user_dtypes.size(); i++) {
+            NodeIndexT const first_field_id = adj[root_struct_col_id][i];
+            is_pruned[first_field_id]       = false;
+            for (auto const& child_id : adj[first_field_id])  // children of field (>1 if mixed)
+              mark_is_pruned(child_id, schema_element{user_dtypes[i]});
+          }
+        },
+        [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](
+          std::map<std::string, data_type> const& user_dtypes) -> void {
+          mark_is_pruned(root_struct_col_id, u_schema);
+        },
+        [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](
+          std::map<std::string, schema_element> const& user_dtypes) -> void {
+          mark_is_pruned(root_struct_col_id, u_schema);
+        }},
+      options.get_dtypes());
+  }
+  // Useful for array of arrays: col_id of the level whose children carry the column names
+  auto named_level =
+    is_enabled_lines
+      ? adj[parent_node_sentinel][0]
+      : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]);
+
+  auto handle_mixed_types = [&column_categories,
+                             &is_str_column_all_nulls,
+                             &is_pruned,
+                             &expected_types,
+                             &is_enabled_mixed_types_as_string,
+                             &ignore_all_children](std::vector<NodeIndexT>& child_ids) {
+    // do these on unpruned columns only.
+    // when mixed types is disabled, ignore string sibling of nested column.
+    // when mixed types is disabled, and both list and struct columns are siblings, error out.
+    // when mixed types is enabled, force string type on all columns
+
+    // Remove pruned children (forced type will not clash here because other types are already
+    // pruned)
+    child_ids.erase(
+      std::remove_if(child_ids.begin(),
+                     child_ids.end(),
+                     [&is_pruned](NodeIndexT child_id) { return is_pruned[child_id]; }),
+      child_ids.end());
+    // find string id, struct id, list id (the last match wins for each kind).
+    NodeIndexT str_col_id{-1}, struct_col_id{-1}, list_col_id{-1};
+    for (auto const& child_id : child_ids) {
+      if (column_categories[child_id] == NC_VAL || column_categories[child_id] == NC_STR)
+        str_col_id = child_id;
+      else if (column_categories[child_id] == NC_STRUCT)
+        struct_col_id = child_id;
+      else if (column_categories[child_id] == NC_LIST)
+        list_col_id = child_id;
+    }
+    // conditions for handling mixed types.
+    if (is_enabled_mixed_types_as_string) {
+      if (struct_col_id != -1 and list_col_id != -1) {
+        expected_types[struct_col_id] = NC_STR;
+        expected_types[list_col_id]   = NC_STR;
+        // ignore children of nested columns.
+        ignore_all_children(struct_col_id);
+        ignore_all_children(list_col_id);
+      }
+      if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) {
+        if (is_str_column_all_nulls[str_col_id])
+          is_pruned[str_col_id] = true;
+        else {
+          // ignore children of nested columns.
+          if (struct_col_id != -1) {
+            expected_types[struct_col_id] = NC_STR;
+            ignore_all_children(struct_col_id);
+          }
+          if (list_col_id != -1) {
+            expected_types[list_col_id] = NC_STR;
+            ignore_all_children(list_col_id);
+          }
+        }
+      }
+    } else {
+      // if both are present, error out.
+      CUDF_EXPECTS(struct_col_id == -1 or list_col_id == -1,
+                   "A mix of lists and structs within the same column is not supported");
+      // either one only: so ignore str column.
+      if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) {
+        is_pruned[str_col_id] = true;
+      }
+    }
+  };
+
+  using dev_ref = std::reference_wrapper<device_json_column>;
+  std::unordered_map<NodeIndexT, dev_ref> columns;
+  columns.try_emplace(parent_node_sentinel, std::ref(root));
+  // convert adjacency list to tree.
+  dev_ref parent_ref = std::ref(root);
+  // creates children column
+  std::function<void(NodeIndexT, dev_ref)> construct_tree;
+  construct_tree = [&](NodeIndexT root, dev_ref ref) -> void {
+    if (is_pruned[root]) return;
+    auto expected_category =
+      expected_types[root] == NUM_NODE_CLASSES ? column_categories[root] : expected_types[root];
+    initialize_json_columns(root, ref, expected_category);
+    auto child_ids = adj.count(root) ? adj[root] : std::vector<NodeIndexT>{};
+    if (expected_category == NC_STRUCT) {
+      // find field column ids, and its children and create columns.
+      for (auto const& field_id : child_ids) {
+        auto name = column_names[field_id];
+        if (is_pruned[field_id]) continue;
+        auto inserted =
+          ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second;
+        ref.get().column_order.emplace_back(name);
+        CUDF_EXPECTS(inserted,
+                     "struct child column insertion failed, duplicate column name in the parent");
+        auto this_ref = std::ref(ref.get().child_columns.at(name));
+        // Mixed type handling
+        auto& value_col_ids = adj[field_id];
+        handle_mixed_types(value_col_ids);
+        if (value_col_ids.empty()) {
+          // If no column is present, remove the uninitialized column.
+          ref.get().child_columns.erase(name);
+          ref.get().column_order.pop_back();
+          continue;
+        }
+        for (auto const& child_id : value_col_ids) {  // children of field (>1 if mixed)
+          if (is_pruned[child_id]) continue;
+          columns.try_emplace(child_id, this_ref);
+          construct_tree(child_id, this_ref);
+        }
+      }
+    } else if (expected_category == NC_LIST) {
+      // array of arrays interpreted as array of structs.
+      if (is_array_of_arrays and root == named_level) {
+        // create column names
+        std::map<NodeIndexT, std::vector<NodeIndexT>> array_values;
+        for (auto const& child_id : child_ids) {
+          if (is_pruned[child_id]) continue;
+          auto name = column_names[child_id];
+          array_values[std::stoi(name)].push_back(child_id);
+        }
+        // one child column per value index; col_ids sharing an index share that column
+        for (auto const& value_id_pair : array_values) {
+          auto [value_id, value_col_ids] = value_id_pair;
+          auto name                      = std::to_string(value_id);
+          auto inserted =
+            ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second;
+          ref.get().column_order.emplace_back(name);
+          CUDF_EXPECTS(inserted,
+                       "list child column insertion failed, duplicate column name in the parent");
+          auto this_ref = std::ref(ref.get().child_columns.at(name));
+          handle_mixed_types(value_col_ids);
+          if (value_col_ids.empty()) {
+            // If no column is present, remove the uninitialized column.
+            ref.get().child_columns.erase(name);
+            ref.get().column_order.pop_back();
+            continue;
+          }
+          for (auto const& child_id : value_col_ids) {  // children of field (>1 if mixed)
+            if (is_pruned[child_id]) continue;
+            columns.try_emplace(child_id, this_ref);
+            construct_tree(child_id, this_ref);
+          }
+        }
+      } else {
+        if (child_ids.empty()) return;
+        auto inserted =
+          ref.get()
+            .child_columns.try_emplace(list_child_name, device_json_column(stream, mr))
+            .second;
+        CUDF_EXPECTS(inserted,
+                     "list child column insertion failed, duplicate column name in the parent");
+        ref.get().column_order.emplace_back(list_child_name);
+        auto this_ref = std::ref(ref.get().child_columns.at(list_child_name));
+        // Mixed type handling
+        handle_mixed_types(child_ids);
+        if (child_ids.empty()) {
+          // No child left: erase placeholder. NOTE(review): column_order is not popped here.
+          ref.get().child_columns.erase(list_child_name);
+        }
+        for (auto const& child_id : child_ids) {
+          if (is_pruned[child_id]) continue;
+          columns.try_emplace(child_id, this_ref);
+          construct_tree(child_id, this_ref);
+        }
+      }
+    }
+  };
+  auto inserted = parent_ref.get()
+                    .child_columns.try_emplace(list_child_name, device_json_column(stream, mr))
+                    .second;
+  CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent");
+  parent_ref = std::ref(parent_ref.get().child_columns.at(list_child_name));
+  columns.try_emplace(adj[parent_node_sentinel][0], parent_ref);
+  construct_tree(adj[parent_node_sentinel][0], parent_ref);
+
+  // Forced string type due to input schema and mixed type as string.
+  for (size_t i = 0; i < expected_types.size(); i++) {
+    if (expected_types[i] == NC_STR) {
+      if (columns.count(i)) { columns.at(i).get().forced_as_string_column = true; }
+    }
+  }
+  std::transform(expected_types.cbegin(),
+                 expected_types.cend(),
+                 column_categories.cbegin(),
+                 expected_types.begin(),
+                 [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; });
+  CUDF_CUDA_TRY(cudaMemcpyAsync(d_column_tree.node_categories.begin(),
+                                expected_types.data(),
+                                expected_types.size() * sizeof(column_categories[0]),
+                                cudaMemcpyDefault,
+                                stream.value()));  // write final categories back to device
+
+  return {is_pruned, columns};
+}
+}  // namespace experimental
+
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index b08fd139113..912e93d52ae 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -47,7 +47,6 @@
 
 namespace cudf::io::json::detail {
 
-// DEBUG prints
 auto to_cat = [](auto v) -> std::string {
   switch (v) {
     case NC_STRUCT: return " S";
@@ -105,19 +104,20 @@ void print_tree(host_span<SymbolT const> input,
  * max row offsets of columns
  */
 std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
-reduce_to_column_tree(tree_meta_t& tree,
-                      device_span<NodeIndexT> original_col_ids,
-                      device_span<NodeIndexT> sorted_col_ids,
-                      device_span<NodeIndexT> ordered_node_ids,
-                      device_span<size_type> row_offsets,
+reduce_to_column_tree(tree_meta_t const& tree,
+                      device_span<NodeIndexT const> original_col_ids,
+                      device_span<NodeIndexT const> sorted_col_ids,
+                      device_span<NodeIndexT const> ordered_node_ids,
+                      device_span<size_type const> row_offsets,
                       bool is_array_of_arrays,
                       NodeIndexT const row_array_parent_col_id,
                       rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
+
   // 1. column count for allocation
-  auto const num_columns =
-    thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end());
+  auto const num_columns = thrust::unique_count(
+    rmm::exec_policy_nosync(stream), sorted_col_ids.begin(), sorted_col_ids.end());
 
   // 2. reduce_by_key {col_id}, {row_offset}, max.
   rmm::device_uvector<NodeIndexT> unique_col_ids(num_columns, stream);
@@ -162,30 +162,34 @@ reduce_to_column_tree(tree_meta_t& tree,
     });
 
   // 4. unique_copy parent_node_ids, ranges
-  rmm::device_uvector<TreeDepthT> column_levels(0, stream);  // not required
+  rmm::device_uvector<TreeDepthT> column_levels(num_columns, stream);  // not required
   rmm::device_uvector<NodeIndexT> parent_col_ids(num_columns, stream);
   rmm::device_uvector<SymbolOffsetT> col_range_begin(num_columns, stream);  // Field names
   rmm::device_uvector<SymbolOffsetT> col_range_end(num_columns, stream);
   rmm::device_uvector<size_type> unique_node_ids(num_columns, stream);
-  thrust::unique_by_key_copy(rmm::exec_policy(stream),
+  thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream),
                              sorted_col_ids.begin(),
                              sorted_col_ids.end(),
                              ordered_node_ids.begin(),
                              thrust::make_discard_iterator(),
                              unique_node_ids.begin());
+
   thrust::copy_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_zip_iterator(
+      thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()),
       thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()),
       thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()),
       thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())),
     unique_node_ids.size(),
-    thrust::make_zip_iterator(
-      parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin()));
+    thrust::make_zip_iterator(column_levels.begin(),
+                              parent_col_ids.begin(),
+                              col_range_begin.begin(),
+                              col_range_end.begin()));
 
   // convert parent_node_ids to parent_col_ids
   thrust::transform(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     parent_col_ids.begin(),
     parent_col_ids.end(),
     parent_col_ids.begin(),
@@ -203,18 +207,17 @@ reduce_to_column_tree(tree_meta_t& tree,
              column_categories[parent_col_id] == NC_LIST &&
                (!is_array_of_arrays || parent_col_id != row_array_parent_col_id));
   };
+
   // Mixed types in List children go to different columns,
   // so all immediate children of list column should have same max_row_offsets.
   //   create list's children max_row_offsets array. (initialize to zero)
   //   atomicMax on  children max_row_offsets array.
   //   gather the max_row_offsets from children row offset array.
   {
-    rmm::device_uvector<NodeIndexT> list_parents_children_max_row_offsets(num_columns, stream);
-    thrust::fill(rmm::exec_policy(stream),
-                 list_parents_children_max_row_offsets.begin(),
-                 list_parents_children_max_row_offsets.end(),
-                 0);
-    thrust::for_each(rmm::exec_policy(stream),
+    auto list_parents_children_max_row_offsets =
+      cudf::detail::make_zeroed_device_uvector_async<NodeIndexT>(
+        static_cast<std::size_t>(num_columns), stream, cudf::get_current_device_resource_ref());
+    thrust::for_each(rmm::exec_policy_nosync(stream),
                      unique_col_ids.begin(),
                      unique_col_ids.end(),
                      [column_categories = column_categories.begin(),
@@ -230,8 +233,9 @@ reduce_to_column_tree(tree_meta_t& tree,
                          ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed);
                        }
                      });
+
     thrust::gather_if(
-      rmm::exec_policy(stream),
+      rmm::exec_policy_nosync(stream),
       parent_col_ids.begin(),
       parent_col_ids.end(),
       parent_col_ids.begin(),
@@ -246,7 +250,7 @@ reduce_to_column_tree(tree_meta_t& tree,
   // copy lists' max_row_offsets to children.
   // all structs should have same size.
   thrust::transform_if(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     unique_col_ids.begin(),
     unique_col_ids.end(),
     max_row_offsets.begin(),
@@ -272,7 +276,7 @@ reduce_to_column_tree(tree_meta_t& tree,
 
   // For Struct and List (to avoid copying entire strings when mixed type as string is enabled)
   thrust::transform_if(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     col_range_begin.begin(),
     col_range_begin.end(),
     column_categories.begin(),
@@ -313,7 +317,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
     // Note: json_col modified here, moves this memory
   };
 
-  auto get_child_schema = [schema](auto child_name) -> std::optional<schema_element> {
+  auto get_child_schema = [&schema](auto child_name) -> std::optional<schema_element> {
     if (schema.has_value()) {
       auto const result = schema.value().child_types.find(child_name);
       if (result != std::end(schema.value().child_types)) { return result->second; }
@@ -321,6 +325,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
     return {};
   };
 
+  auto get_list_child_schema = [&schema]() -> std::optional<schema_element> {
+    if (schema.has_value()) {
+      if (schema.value().child_types.size() > 0) return schema.value().child_types.begin()->second;
+    }
+    return {};
+  };
+
   switch (json_col.type) {
     case json_col_t::StringColumn: {
       // move string_offsets to GPU and transform to string column
@@ -435,9 +446,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                                                      rmm::device_buffer{},
                                                      0);
       // Create children column
-      auto child_schema_element = json_col.child_columns.empty()
-                                    ? std::optional<schema_element>{}
-                                    : get_child_schema(json_col.child_columns.begin()->first);
+      auto child_schema_element =
+        json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
       auto [child_column, names] =
         json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
           ? std::pair<std::unique_ptr<column>,
@@ -475,6 +485,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
   }
 }
 
+template <typename... Args>  // Args: forwarded unchanged to whichever implementation is selected
+auto make_device_json_column_dispatch(bool experimental, Args&&... args)
+{
+  if (experimental) {  // the qualified `experimental::` below is the namespace, not this flag
+    return experimental::make_device_json_column(std::forward<Args>(args)...);
+  } else {  // default path: the non-experimental make_device_json_column
+    return make_device_json_column(std::forward<Args>(args)...);
+  }
+}
+
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                              cudf::io::json_reader_options const& options,
                                              rmm::cuda_stream_view stream,
@@ -520,6 +540,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                   gpu_tree,
                                   is_array_of_arrays,
                                   options.is_enabled_lines(),
+                                  options.is_enabled_experimental(),
                                   stream,
                                   cudf::get_current_device_resource_ref());
 
@@ -532,15 +553,16 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                0);
 
   // Get internal JSON column
-  make_device_json_column(d_input,
-                          gpu_tree,
-                          gpu_col_id,
-                          gpu_row_offsets,
-                          root_column,
-                          is_array_of_arrays,
-                          options,
-                          stream,
-                          mr);
+  make_device_json_column_dispatch(options.is_enabled_experimental(),
+                                   d_input,
+                                   gpu_tree,
+                                   gpu_col_id,
+                                   gpu_row_offsets,
+                                   root_column,
+                                   is_array_of_arrays,
+                                   options,
+                                   stream,
+                                   mr);
 
   // data_root refers to the root column of the data represented by the given JSON string
   auto& data_root =
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 4d0dc010c57..d949635c1cc 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -14,17 +14,18 @@
  * limitations under the License.
  */
 
-#include "io/utilities/hostdevice_vector.hpp"
+#include "io/utilities/parsing_utils.cuh"
+#include "io/utilities/string_parsing.hpp"
 #include "nested_json.hpp"
 
 #include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/scatter.cuh>
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/hashing/detail/default_hash.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/helper_functions.cuh>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
@@ -34,12 +35,14 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cub/device/device_radix_sort.cuh>
+#include <cuco/static_map.cuh>
 #include <cuco/static_set.cuh>
 #include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/fill.h>
+#include <thrust/functional.h>
 #include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -492,6 +495,85 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
           std::move(node_range_end)};
 }
 
+// Unicode-decodes field names of `keys` and matches equal names; returns {unique count, matches}
+std::pair<size_t, rmm::device_uvector<size_type>> remapped_field_nodes_after_unicode_decode(
+  device_span<SymbolT const> d_input,
+  tree_meta_t const& d_tree,
+  device_span<size_type const> keys,
+  rmm::cuda_stream_view stream)
+{
+  size_t num_keys = keys.size();
+  if (num_keys == 0) { return {num_keys, rmm::device_uvector<size_type>(num_keys, stream)}; }
+  rmm::device_uvector<size_type> offsets(num_keys, stream);  // node_range_begin of each key
+  rmm::device_uvector<size_type> lengths(num_keys, stream);  // node_range_end - node_range_begin
+  auto offset_length_it = thrust::make_zip_iterator(offsets.begin(), lengths.begin());
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    keys.begin(),
+                    keys.end(),
+                    offset_length_it,
+                    [node_range_begin = d_tree.node_range_begin.data(),
+                     node_range_end   = d_tree.node_range_end.data()] __device__(auto key) {
+                      return thrust::make_tuple(node_range_begin[key],
+                                                node_range_end[key] - node_range_begin[key]);
+                    });
+  cudf::io::parse_options_view opt{',', '\n', '\0', '.'};
+  opt.keepquotes = true;
+
+  auto utf8_decoded_fields = parse_data(d_input.data(),
+                                        offset_length_it,
+                                        num_keys,
+                                        data_type{type_id::STRING},
+                                        rmm::device_buffer{},
+                                        0,
+                                        opt,
+                                        stream,
+                                        cudf::get_current_device_resource_ref());
+  // Deduplicate the decoded names: insert indices [0, num_keys) into a hash set that hashes and
+  // compares by decoded string, so found_keys[i] is the index of the entry equal to name i.
+  // The caller then stores keys[i] -> keys[found_keys[i]] in a static_map.
+
+  auto str_view         = strings_column_view{utf8_decoded_fields->view()};
+  auto const char_ptr   = str_view.chars_begin(stream);
+  auto const offset_ptr = str_view.offsets().begin<size_type>();
+
+  // Hashes index i by the i-th decoded field name
+  auto const d_hasher = cuda::proclaim_return_type<
+    typename cudf::hashing::detail::default_hash<cudf::string_view>::result_type>(
+    [char_ptr, offset_ptr] __device__(auto node_id) {
+      auto const field_name = cudf::string_view(char_ptr + offset_ptr[node_id],
+                                                offset_ptr[node_id + 1] - offset_ptr[node_id]);
+      return cudf::hashing::detail::default_hash<cudf::string_view>{}(field_name);
+    });
+  auto const d_equal = [char_ptr, offset_ptr] __device__(auto node_id1, auto node_id2) {
+    auto const field_name1 = cudf::string_view(char_ptr + offset_ptr[node_id1],
+                                               offset_ptr[node_id1 + 1] - offset_ptr[node_id1]);
+    auto const field_name2 = cudf::string_view(char_ptr + offset_ptr[node_id2],
+                                               offset_ptr[node_id2 + 1] - offset_ptr[node_id2]);
+    return field_name1 == field_name2;
+  };
+
+  using hasher_type                             = decltype(d_hasher);
+  constexpr size_type empty_node_index_sentinel = -1;
+  auto key_set                                  = cuco::static_set{
+    cuco::extent{compute_hash_table_size(num_keys)},
+    cuco::empty_key{empty_node_index_sentinel},
+    d_equal,
+    cuco::linear_probing<1, hasher_type>{d_hasher},
+                                     {},
+                                     {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
+  auto const counting_iter = thrust::make_counting_iterator<size_type>(0);
+  rmm::device_uvector<size_type> found_keys(num_keys, stream);
+  key_set.insert_and_find_async(counting_iter,
+                                counting_iter + num_keys,
+                                found_keys.begin(),
+                                thrust::make_discard_iterator(),
+                                stream.value());
+  // set.size will synchronize the stream before return.
+  return {key_set.size(stream), std::move(found_keys)};
+}
+
 /**
  * @brief Generates unique node_type id for each node.
  * Field nodes with the same name are assigned the same node_type id.
@@ -500,11 +582,14 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
  * All inputs and outputs are in node_id order.
  * @param d_input JSON string in device memory
  * @param d_tree Tree representation of the JSON
+ * @param is_enabled_experimental Whether to enable experimental features such as
+ * utf8 field name support
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @return Vector of node_type ids
  */
 rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<SymbolT const> d_input,
                                                               tree_meta_t const& d_tree,
+                                                              bool is_enabled_experimental,
                                                               rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
@@ -536,7 +621,7 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
   };
   // key-value pairs: uses node_id itself as node_type. (unique node_id for a field name due to
   // hashing)
-  auto const iter = thrust::make_counting_iterator<size_type>(0);
+  auto const counting_iter = thrust::make_counting_iterator<size_type>(0);
 
   auto const is_field_name_node = [node_categories =
                                      d_tree.node_categories.data()] __device__(auto node_id) {
@@ -554,15 +639,61 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
                                      {},
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value()};
-  key_set.insert_if_async(iter,
-                          iter + num_nodes,
+  key_set.insert_if_async(counting_iter,
+                          counting_iter + num_nodes,
                           thrust::counting_iterator<size_type>(0),  // stencil
                           is_field_name_node,
                           stream.value());
 
+  // experimental feature: utf8 field name support
+  // parse_data on field names,
+  // rehash it using another map,
+  // reassign the reverse map values to new matched node indices.
+  auto get_utf8_matched_field_nodes = [&]() {
+    auto make_map = [&stream](auto num_keys) {
+      using hasher_type3 = cudf::hashing::detail::default_hash<size_type>;
+      return cuco::static_map{
+        cuco::extent{compute_hash_table_size(num_keys, 100)},  // 100% occupancy
+        cuco::empty_key{empty_node_index_sentinel},
+        cuco::empty_value{empty_node_index_sentinel},
+        {},
+        cuco::linear_probing<1, hasher_type3>{hasher_type3{}},
+        {},
+        {},
+        cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+        stream.value()};
+    };
+    if (!is_enabled_experimental) { return std::pair{false, make_map(0)}; }
+    // get all unique field node ids for utf8 decoding
+    auto num_keys = key_set.size(stream);
+    rmm::device_uvector<size_type> keys(num_keys, stream);
+    key_set.retrieve_all(keys.data(), stream.value());
+
+    auto [num_unique_fields, found_keys] =
+      remapped_field_nodes_after_unicode_decode(d_input, d_tree, keys, stream);
+
+    auto is_need_remap = num_unique_fields != num_keys;
+    if (!is_need_remap) { return std::pair{false, make_map(0)}; }
+
+    // store to static_map with keys as field keys[index], and values as keys[found_keys[index]]
+    auto reverse_map        = make_map(num_keys);
+    auto matching_keys_iter = thrust::make_permutation_iterator(keys.begin(), found_keys.begin());
+    auto pair_iter =
+      thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), matching_keys_iter));
+    reverse_map.insert_async(pair_iter, pair_iter + num_keys, stream);
+    return std::pair{is_need_remap, std::move(reverse_map)};
+  };
+  auto [is_need_remap, reverse_map] = get_utf8_matched_field_nodes();
+
   auto const get_hash_value =
-    [key_set = key_set.ref(cuco::op::find)] __device__(auto node_id) -> size_type {
+    [key_set       = key_set.ref(cuco::op::find),
+     is_need_remap = is_need_remap,
+     rm            = reverse_map.ref(cuco::op::find)] __device__(auto node_id) -> size_type {
     auto const it = key_set.find(node_id);
+    if (it != key_set.end() and is_need_remap) {
+      auto const it2 = rm.find(*it);
+      return (it2 == rm.end()) ? size_type{0} : it2->second;
+    }
     return (it == key_set.end()) ? size_type{0} : *it;
   };
 
@@ -771,6 +902,8 @@ std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_n
  * @param d_tree Tree representation of the JSON
  * @param is_array_of_arrays Whether the tree is an array of arrays
  * @param is_enabled_lines Whether the input is a line-delimited JSON
+ * @param is_enabled_experimental Whether to enable experimental features such as
+ * utf8 field name support
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return column_id, parent_column_id
@@ -780,6 +913,7 @@ std::pair<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<NodeIndexT>> gene
   tree_meta_t const& d_tree,
   bool is_array_of_arrays,
   bool is_enabled_lines,
+  bool is_enabled_experimental,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
@@ -793,7 +927,7 @@ std::pair<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<NodeIndexT>> gene
   auto [col_id, unique_keys] = [&]() {
     // Convert node_category + field_name to node_type.
     rmm::device_uvector<size_type> node_type =
-      hash_node_type_with_field_name(d_input, d_tree, stream);
+      hash_node_type_with_field_name(d_input, d_tree, is_enabled_experimental, stream);
 
     // hash entire path from node to root.
     return hash_node_path(d_tree.node_levels,
@@ -948,12 +1082,13 @@ records_orient_tree_traversal(device_span<SymbolT const> d_input,
                               tree_meta_t const& d_tree,
                               bool is_array_of_arrays,
                               bool is_enabled_lines,
+                              bool is_enabled_experimental,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  auto [new_col_id, new_parent_col_id] =
-    generate_column_id(d_input, d_tree, is_array_of_arrays, is_enabled_lines, stream, mr);
+  auto [new_col_id, new_parent_col_id] = generate_column_id(
+    d_input, d_tree, is_array_of_arrays, is_enabled_lines, is_enabled_experimental, stream, mr);
 
   auto row_offsets = compute_row_offsets(
     std::move(new_parent_col_id), d_tree, is_array_of_arrays, is_enabled_lines, stream, mr);
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 83f71e657a7..3d9a51833e0 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -185,6 +185,55 @@ struct device_json_column {
   }
 };
 
+namespace experimental {
+/**
+ * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format.
+ */
+struct compressed_sparse_row {
+  rmm::device_uvector<NodeIndexT> row_idx;  // NOTE(review): presumably CSR row offsets; confirm
+  rmm::device_uvector<NodeIndexT> col_idx;  // NOTE(review): presumably CSR column indices; confirm
+};
+
+/**
+ * @brief Auxiliary column tree properties that are required to construct the device json
+ * column subtree, but not required for the final cudf column construction.
+ */
+struct column_tree_properties {
+  rmm::device_uvector<NodeT> categories;           // column category of each column tree node
+  rmm::device_uvector<size_type> max_row_offsets;  // max number of rows in each column
+  rmm::device_uvector<NodeIndexT> mapped_ids;      // column id of each column tree node
+};
+
+namespace detail {
+/**
+ * @brief Reduce node tree into column tree by aggregating each property of column.
+ *
+ * @param node_tree Node tree representation of JSON string
+ * @param original_col_ids Column ids of nodes
+ * @param sorted_col_ids Sorted column ids of nodes
+ * @param ordered_node_ids Node ids of nodes sorted by column ids
+ * @param row_offsets Row offsets of nodes
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Tuple of compressed_sparse_row struct storing adjacency information of the column tree,
+ * and column_tree_properties struct storing properties of each node i.e. column category, max
+ * number of rows in the column, and column id
+ */
+CUDF_EXPORT
+std::tuple<compressed_sparse_row, column_tree_properties> reduce_to_column_tree(
+  tree_meta_t& node_tree,
+  device_span<NodeIndexT const> original_col_ids,
+  device_span<NodeIndexT const> sorted_col_ids,
+  device_span<NodeIndexT const> ordered_node_ids,
+  device_span<size_type const> row_offsets,
+  bool is_array_of_arrays,
+  NodeIndexT row_array_parent_col_id,
+  rmm::cuda_stream_view stream);
+
+}  // namespace detail
+}  // namespace experimental
+
 namespace detail {
 
 // TODO: return device_uvector instead of passing pre-allocated memory
@@ -267,6 +316,8 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
  * index, level, begin index, and end index in the input JSON string
  * @param is_array_of_arrays Whether the tree is an array of arrays
  * @param is_enabled_lines Whether the input is a line-delimited JSON
+ * @param is_enabled_experimental Whether to enable experimental features such as utf-8 field name
+ * support
  * @param stream The CUDA stream to which kernels are dispatched
  * @param mr Optional, resource with which to allocate
  * @return A tuple of the output column indices and the row offsets within each column for each node
@@ -277,6 +328,7 @@ records_orient_tree_traversal(device_span<SymbolT const> d_input,
                               tree_meta_t const& d_tree,
                               bool is_array_of_arrays,
                               bool is_enabled_lines,
+                              bool is_enabled_experimental,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr);
 
@@ -314,12 +366,13 @@ get_array_children_indices(TreeDepthT row_array_children_level,
  * @return A tuple of column tree representation of JSON string, column ids of columns, and
  * max row offsets of columns
  */
+CUDF_EXPORT
 std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
-reduce_to_column_tree(tree_meta_t& tree,
-                      device_span<NodeIndexT> original_col_ids,
-                      device_span<NodeIndexT> sorted_col_ids,
-                      device_span<NodeIndexT> ordered_node_ids,
-                      device_span<size_type> row_offsets,
+reduce_to_column_tree(tree_meta_t const& tree,
+                      device_span<NodeIndexT const> original_col_ids,
+                      device_span<NodeIndexT const> sorted_col_ids,
+                      device_span<NodeIndexT const> ordered_node_ids,
+                      device_span<size_type const> row_offsets,
                       bool is_array_of_arrays,
                       NodeIndexT const row_array_parent_col_id,
                       rmm::cuda_stream_view stream);
@@ -343,14 +396,30 @@ reduce_to_column_tree(tree_meta_t& tree,
  * of child_offets and validity members of `d_json_column`
  */
 void make_device_json_column(device_span<SymbolT const> input,
-                             tree_meta_t& tree,
-                             device_span<NodeIndexT> col_ids,
-                             device_span<size_type> row_offsets,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
                              device_json_column& root,
                              bool is_array_of_arrays,
                              cudf::io::json_reader_options const& options,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
+
+namespace experimental {
+/**
+ * @copydoc cudf::io::json::detail::make_device_json_column
+ */
+void make_device_json_column(device_span<SymbolT const> input,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
+                             device_json_column& root,
+                             bool is_array_of_arrays,
+                             cudf::io::json_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr);
+}  // namespace experimental
+
 /**
  * @brief Retrieves the parse_options to be used for type inference and type casting
  *
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 86402a0e7de..573101cefd9 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -22,7 +22,6 @@
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 
-#include <cuco/static_map.cuh>
 #include <cuco/static_multimap.cuh>
 #include <cuda/atomic>
 
@@ -51,11 +50,6 @@ using mixed_multimap_type =
                         cudf::detail::cuco_allocator<char>,
                         cuco::legacy::double_hashing<1, hash_type, hash_type>>;
 
-using semi_map_type = cuco::legacy::static_map<hash_value_type,
-                                               size_type,
-                                               cuda::thread_scope_device,
-                                               cudf::detail::cuco_allocator<char>>;
-
 using row_hash_legacy =
   cudf::row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>;
 
diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh
index 19701816867..4a52cfe098a 100644
--- a/cpp/src/join/mixed_join_common_utils.cuh
+++ b/cpp/src/join/mixed_join_common_utils.cuh
@@ -25,6 +25,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <cub/cub.cuh>
+#include <cuco/static_set.cuh>
 
 namespace cudf {
 namespace detail {
@@ -160,6 +161,39 @@ struct pair_expression_equality : public expression_equality<has_nulls> {
   }
 };
 
+/**
+ * @brief Equality comparator that is true only when two row_equality comparators both match.
+ */
+struct double_row_equality_comparator {
+  row_equality const equality_comparator;  // evaluated first
+  row_equality const conditional_comparator;  // evaluated only if equality_comparator matches
+
+  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
+  {
+    using experimental::row::lhs_index_type;
+    using experimental::row::rhs_index_type;
+
+    return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
+           conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
+  }
+};
+
+// A CUDA Cooperative Group of 1 thread for the hash set for mixed semi join.
+auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1;
+
+// The hash set type used by mixed_semi_join with the build_table.
+using hash_set_type =
+  cuco::static_set<size_type,
+                   cuco::extent<size_t>,
+                   cuda::thread_scope_device,
+                   double_row_equality_comparator,
+                   cuco::linear_probing<DEFAULT_MIXED_SEMI_JOIN_CG_SIZE, row_hash>,
+                   cudf::detail::cuco_allocator<char>,
+                   cuco::storage<1>>;
+
+// The hash_set_ref_type used by mixed_semi_join kernels for probing.
+using hash_set_ref_type = hash_set_type::ref_type<cuco::contains_tag>;
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index 7459ac3e99c..bd8c80652a0 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -38,38 +38,48 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                   table_device_view right_table,
                   table_device_view probe,
                   table_device_view build,
-                  row_hash const hash_probe,
                   row_equality const equality_probe,
-                  cudf::detail::semi_map_type::device_view hash_table_view,
+                  hash_set_ref_type set_ref,
                   cudf::device_span<bool> left_table_keep_mask,
                   cudf::ast::detail::expression_device_view device_expression_data)
 {
+  auto constexpr cg_size = hash_set_ref_type::cg_size;
+
+  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
   // used to circumvent conflicts between arrays of different types between
   // different template instantiations due to the extern specifier.
   extern __shared__ char raw_intermediate_storage[];
-  cudf::ast::detail::IntermediateDataType<has_nulls>* intermediate_storage =
+  auto intermediate_storage =
     reinterpret_cast<cudf::ast::detail::IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
   auto thread_intermediate_storage =
-    &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
+    intermediate_storage + (tile.meta_group_rank() * device_expression_data.num_intermediates);
 
-  cudf::size_type const left_num_rows  = left_table.num_rows();
-  cudf::size_type const right_num_rows = right_table.num_rows();
-  auto const outer_num_rows            = left_num_rows;
+  // Equality evaluator to use
+  auto const evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
+    left_table, right_table, device_expression_data);
 
-  cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size;
+  // Make sure to swap_tables here as hash_set will use probe table as the left one
+  auto constexpr swap_tables = true;
+  auto const equality        = single_expression_equality<has_nulls>{
+    evaluator, thread_intermediate_storage, swap_tables, equality_probe};
 
-  auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
-    left_table, right_table, device_expression_data);
+  // Create set ref with the new equality comparator
+  auto const set_ref_equality = set_ref.with_key_eq(equality);
 
-  if (outer_row_index < outer_num_rows) {
-    // Figure out the number of elements for this key.
-    auto equality = single_expression_equality<has_nulls>{
-      evaluator, thread_intermediate_storage, false, equality_probe};
+  // Total number of rows to query the set
+  auto const outer_num_rows = left_table.num_rows();
+  // Grid stride for the tile
+  auto const cg_grid_stride = cudf::detail::grid_1d::grid_stride<block_size>() / cg_size;
 
-    left_table_keep_mask[outer_row_index] =
-      hash_table_view.contains(outer_row_index, hash_probe, equality);
+  // Find all the rows in the left table that are in the hash table
+  for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>() / cg_size;
+       outer_row_index < outer_num_rows;
+       outer_row_index += cg_grid_stride) {
+    auto const result = set_ref_equality.contains(tile, outer_row_index);
+    if (tile.thread_rank() == 0) { left_table_keep_mask[outer_row_index] = result; }
   }
 }
 
@@ -78,9 +88,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
@@ -94,9 +103,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   } else {
@@ -106,9 +114,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   }
diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh
index 43714ffb36a..b08298e64e4 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cuh
+++ b/cpp/src/join/mixed_join_kernels_semi.cuh
@@ -45,9 +45,8 @@ namespace detail {
  * @param[in] right_table The right table
  * @param[in] probe The table with which to probe the hash table for matches.
  * @param[in] build The table with which the hash table was built.
- * @param[in] hash_probe The hasher used for the probe table.
  * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] hash_table_view The hash table built from `build`.
+ * @param[in] set_ref The hash table device view built from `build`.
  * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating
  * the corresponding index from left table is present in output
  * @param[in] device_expression_data Container of device data required to evaluate the desired
@@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index aa4fa281159..83a55eca50f 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -45,45 +45,6 @@
 namespace cudf {
 namespace detail {
 
-namespace {
-/**
- * @brief Device functor to create a pair of hash value and index for a given row.
- */
-struct make_pair_function_semi {
-  __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept
-  {
-    // The value is irrelevant since we only ever use the hash map to check for
-    // membership of a particular row index.
-    return cuco::make_pair(static_cast<hash_value_type>(i), 0);
-  }
-};
-
-/**
- * @brief Equality comparator that composes two row_equality comparators.
- */
-class double_row_equality {
- public:
-  double_row_equality(row_equality equality_comparator, row_equality conditional_comparator)
-    : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator}
-  {
-  }
-
-  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
-  {
-    using experimental::row::lhs_index_type;
-    using experimental::row::rhs_index_type;
-
-    return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
-           _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
-  }
-
- private:
-  row_equality _equality_comparator;
-  row_equality _conditional_comparator;
-};
-
-}  // namespace
-
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   table_view const& left_equality,
   table_view const& right_equality,
@@ -95,7 +56,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) &&
+  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and
                  (join_type != join_kind::FULL_JOIN),
                "Inner, left, and full joins should use mixed_join.");
 
@@ -136,7 +97,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   // output column and follow the null-supporting expression evaluation code
   // path.
   auto const has_nulls = cudf::nullate::DYNAMIC{
-    cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) ||
+    cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or
     binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)};
 
   auto const parser = ast::detail::expression_parser{
@@ -155,27 +116,20 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto right_conditional_view = table_device_view::create(right_conditional, stream);
 
   auto const preprocessed_build =
-    experimental::row::equality::preprocessed_table::create(build, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(build, stream);
   auto const preprocessed_probe =
-    experimental::row::equality::preprocessed_table::create(probe, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(probe, stream);
   auto const row_comparator =
-    cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build};
+    cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe};
   auto const equality_probe = row_comparator.equal_to<false>(has_nulls, compare_nulls);
 
-  semi_map_type hash_table{
-    compute_hash_table_size(build.num_rows()),
-    cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-    cuco::empty_value{cudf::detail::JoinNoneValue},
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
   // Create hash table containing all keys found in right table
   // TODO: To add support for nested columns we will need to flatten in many
   // places. However, this probably isn't worth adding any time soon since we
   // won't be able to support AST conditions for those types anyway.
   auto const build_nulls    = cudf::nullate::DYNAMIC{cudf::has_nulls(build)};
   auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build};
-  auto const hash_build     = row_hash_build.device_hasher(build_nulls);
+
   // Since we may see multiple rows that are identical in the equality tables
   // but differ in the conditional tables, the equality comparator used for
   // insertion must account for both sets of tables. An alternative solution
@@ -190,20 +144,28 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto const equality_build_equality =
     row_comparator_build.equal_to<false>(build_nulls, compare_nulls);
   auto const preprocessed_build_condtional =
-    experimental::row::equality::preprocessed_table::create(right_conditional, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream);
   auto const row_comparator_conditional_build =
     cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional,
                                                             preprocessed_build_condtional};
   auto const equality_build_conditional =
     row_comparator_conditional_build.equal_to<false>(build_nulls, compare_nulls);
-  double_row_equality equality_build{equality_build_equality, equality_build_conditional};
-  make_pair_function_semi pair_func_build{};
 
-  auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build);
+  hash_set_type row_set{
+    {compute_hash_table_size(build.num_rows())},
+    cuco::empty_key{JoinNoneValue},
+    {equality_build_equality, equality_build_conditional},
+    {row_hash_build.device_hasher(build_nulls)},
+    {},
+    {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    {stream.value()}};
+
+  auto iter = thrust::make_counting_iterator(0);
 
   // skip rows that are null here.
   if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) {
-    hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value());
+    row_set.insert_async(iter, iter + right_num_rows, stream.value());
   } else {
     thrust::counting_iterator<cudf::size_type> stencil(0);
     auto const [row_bitmask, _] =
@@ -211,18 +173,19 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
     row_is_valid pred{static_cast<bitmask_type const*>(row_bitmask.data())};
 
     // insert valid rows
-    hash_table.insert_if(
-      iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value());
+    row_set.insert_if_async(iter, iter + right_num_rows, stencil, pred, stream.value());
   }
 
-  auto hash_table_view = hash_table.get_device_view();
-
-  detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
-  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
+  detail::grid_1d const config(outer_num_rows * hash_set_type::cg_size, DEFAULT_JOIN_BLOCK_SIZE);
+  auto const shmem_size_per_block =
+    parser.shmem_per_thread *
+    cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size);
 
   auto const row_hash   = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
   auto const hash_probe = row_hash.device_hasher(has_nulls);
 
+  hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe);
+
   // Vector used to indicate indices from left/probe table which are present in output
   auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);
 
@@ -231,9 +194,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
                          *right_conditional_view,
                          *probe_view,
                          *build_view,
-                         hash_probe,
                          equality_probe,
-                         hash_table_view,
+                         row_set_ref,
                          cudf::device_span<bool>(left_table_keep_mask),
                          parser.device_expression_data,
                          config,
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index 2dd25a7b890..e1c1d2e3002 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -1021,6 +1021,76 @@ struct group_key_func {
   }
 };
 
+/**
+ * @brief Merges the centroids of all the tdigests within each group.
+ *
+ * @param tdv The input tdigest column
+ * @param group_offsets Device-accessible iterator of per-group offsets into the rows of `tdv`
+ * @param num_groups The number of groups
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A pair of device vectors {means, weights}: the input centroids sorted by mean
+ * within each group
+ */
+template <typename GroupOffsetIter>
+std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_merged_centroids(
+  tdigest_column_view const& tdv,
+  GroupOffsetIter group_offsets,
+  size_type num_groups,
+  rmm::cuda_stream_view stream)
+{
+  auto temp_mr                      = cudf::get_current_device_resource_ref();
+  auto const total_merged_centroids = tdv.means().size();
+
+  // output is the merged centroids (means, weights)
+  rmm::device_uvector<double> output_means(total_merged_centroids, stream, temp_mr);
+  rmm::device_uvector<double> output_weights(total_merged_centroids, stream, temp_mr);
+
+  // each group represents a collection of tdigest columns. each row is 1 tdigest.
+  // within each group, we want to sort all the centroids within all the tdigests
+  // in that group, using the means as the key. the "outer offsets" represent the indices of the
+  // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest.
+  //
+  //  rows
+  //  ----        centroid 0 ---------
+  //  tdigest 0   centroid 1
+  //  ----        centroid 2  group 0
+  //  tdigest 1   centroid 3
+  //  ----        centroid 4 ---------
+  //  tdigest 2   centroid 5
+  //  ----        centroid 6  group 1
+  //  tdigest 3   centroid 7
+  //              centroid 8
+  //  ----        centroid 9 --------
+  auto const inner_offsets = tdv.centroids().offsets().begin<size_type>();
+  auto centroid_offsets    = cudf::detail::make_counting_transform_iterator(
+    0,
+    cuda::proclaim_return_type<size_type>([group_offsets, inner_offsets] __device__(size_type i) {
+      return inner_offsets[group_offsets[i]];
+    }));
+
+  // perform the sort using the means as the key. cub is invoked twice with identical arguments:
+  // first with null temp storage to query the required size, then to do the actual sort.
+  size_t temp_size        = 0;
+  auto const sort_by_mean = [&](void* temp_storage) {
+    CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(temp_storage,
+                                                      temp_size,
+                                                      tdv.means().begin<double>(),
+                                                      output_means.begin(),
+                                                      tdv.weights().begin<double>(),
+                                                      output_weights.begin(),
+                                                      total_merged_centroids,
+                                                      num_groups,
+                                                      centroid_offsets,
+                                                      centroid_offsets + 1,
+                                                      stream.value()));
+  };
+  sort_by_mean(nullptr);
+  rmm::device_buffer temp_mem(temp_size, stream, temp_mr);
+  sort_by_mean(temp_mem.data());
+
+  return {std::move(output_means), std::move(output_weights)};
+}
+
 template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                                        HGroupOffsetIter h_outer_offsets,
@@ -1032,59 +1102,6 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
-  // thrust::merge and thrust::merge_by_key don't provide what we need.  What we would need is an
-  // algorithm like a super-merge that takes two layers of keys: one which identifies the outer
-  // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the
-  // outer groups.
-  // TODO: investigate replacing the iterative merge with a single stable_sort_by_key.
-
-  // bring tdigest offsets back to the host
-  auto tdigest_offsets = tdv.centroids().offsets();
-  std::vector<size_type> h_inner_offsets(tdigest_offsets.size());
-  cudaMemcpyAsync(h_inner_offsets.data(),
-                  tdigest_offsets.begin<size_type>(),
-                  sizeof(size_type) * tdigest_offsets.size(),
-                  cudaMemcpyDefault,
-                  stream);
-
-  stream.synchronize();
-
-  // extract all means and weights into a table
-  cudf::table_view tdigests_unsliced({tdv.means(), tdv.weights()});
-
-  // generate the merged (but not yet compressed) tdigests for each group.
-  std::vector<std::unique_ptr<table>> tdigests;
-  tdigests.reserve(num_groups);
-  std::transform(h_outer_offsets,
-                 h_outer_offsets + num_groups,
-                 std::next(h_outer_offsets),
-                 std::back_inserter(tdigests),
-                 [&](auto tdigest_start, auto tdigest_end) {
-                   // the range of tdigests in this group
-                   auto const num_tdigests = tdigest_end - tdigest_start;
-
-                   // slice each tdigest from the input
-                   std::vector<table_view> unmerged_tdigests;
-                   unmerged_tdigests.reserve(num_tdigests);
-                   auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start);
-                   std::transform(
-                     offset_iter,
-                     offset_iter + num_tdigests,
-                     std::next(offset_iter),
-                     std::back_inserter(unmerged_tdigests),
-                     [&](size_type start, size_type end) {
-                       return cudf::detail::slice(tdigests_unsliced, {start, end}, stream);
-                     });
-
-                   // merge
-                   return cudf::detail::merge(unmerged_tdigests,
-                                              {0},
-                                              {order::ASCENDING},
-                                              {},
-                                              stream,
-                                              cudf::get_current_device_resource_ref());
-                 });
-
   // generate min and max values
   auto merged_min_col = cudf::make_numeric_column(
     data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
@@ -1121,7 +1138,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
   auto group_num_weights = cudf::detail::make_counting_transform_iterator(
     0,
     group_num_weights_func<decltype(group_offsets)>{group_offsets,
-                                                    tdigest_offsets.begin<size_type>()});
+                                                    tdv.centroids().offsets().begin<size_type>()});
   thrust::replace_if(rmm::exec_policy(stream),
                      merged_min_col->mutable_view().begin<double>(),
                      merged_min_col->mutable_view().end<double>(),
@@ -1135,29 +1152,33 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                      group_is_empty{},
                      0);
 
-  // concatenate all the merged tdigests back into one table.
-  std::vector<table_view> tdigest_views;
-  tdigest_views.reserve(num_groups);
-  std::transform(tdigests.begin(),
-                 tdigests.end(),
-                 std::back_inserter(tdigest_views),
-                 [](std::unique_ptr<table> const& t) { return t->view(); });
-  auto merged =
-    cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref());
+  auto temp_mr = cudf::get_current_device_resource_ref();
+
+  // merge the centroids
+  auto [merged_means, merged_weights] =
+    generate_merged_centroids(tdv, group_offsets, num_groups, stream);
+  size_t const num_centroids = tdv.means().size();
+  CUDF_EXPECTS(merged_means.size() == num_centroids,
+               "Unexpected number of centroids in merged result");
 
   // generate cumulative weights
-  auto merged_weights     = merged->get_column(1).view();
-  auto cumulative_weights = cudf::make_numeric_column(
-    data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream);
-  auto keys = cudf::detail::make_counting_transform_iterator(
-    0,
-    group_key_func<decltype(group_labels)>{
-      group_labels, tdigest_offsets.begin<size_type>(), tdigest_offsets.size()});
+  rmm::device_uvector<double> cumulative_weights(merged_weights.size(), stream, temp_mr);
+
+  // generate group keys for all centroids in the entire column
+  rmm::device_uvector<size_type> group_keys(num_centroids, stream, temp_mr);
+  auto iter          = thrust::make_counting_iterator(0);
+  auto inner_offsets = tdv.centroids().offsets();
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_centroids,
+                    group_keys.begin(),
+                    group_key_func<decltype(group_labels)>{
+                      group_labels, inner_offsets.begin<size_type>(), inner_offsets.size()});
   thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
-                                keys,
-                                keys + cumulative_weights->size(),
-                                merged_weights.begin<double>(),
-                                cumulative_weights->mutable_view().begin<double>());
+                                group_keys.begin(),
+                                group_keys.begin() + num_centroids,
+                                merged_weights.begin(),
+                                cumulative_weights.begin());
 
   auto const delta = max_centroids;
 
@@ -1166,37 +1187,32 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
     delta,
     num_groups,
     nearest_value_centroid_weights<decltype(group_offsets)>{
-      cumulative_weights->view().begin<double>(),
-      group_offsets,
-      tdigest_offsets.begin<size_type>()},
-    centroid_group_info<decltype(group_offsets)>{cumulative_weights->view().begin<double>(),
-                                                 group_offsets,
-                                                 tdigest_offsets.begin<size_type>()},
+      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
+    centroid_group_info<decltype(group_offsets)>{
+      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
     cumulative_centroid_weight<decltype(group_labels), decltype(group_offsets)>{
-      cumulative_weights->view().begin<double>(),
+      cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
+      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
     false,
     stream,
     mr);
 
   // input centroid values
   auto centroids = cudf::detail::make_counting_transform_iterator(
-    0,
-    make_weighted_centroid{merged->get_column(0).view().begin<double>(),
-                           merged_weights.begin<double>()});
+    0, make_weighted_centroid{merged_means.begin(), merged_weights.begin()});
 
   // compute the tdigest
   return compute_tdigests(
     delta,
     centroids,
-    centroids + merged->num_rows(),
+    centroids + merged_means.size(),
     cumulative_centroid_weight<decltype(group_labels), decltype(group_offsets)>{
-      cumulative_weights->view().begin<double>(),
+      cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
+      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
     std::move(merged_min_col),
     std::move(merged_max_col),
     group_cluster_wl,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 288fa84a73d..b67d922d377 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -329,6 +329,7 @@ ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cp
 ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
 ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp)
 ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu)
+ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu)
 ConfigureTest(
   DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp
   GPUS 1
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 48bc982d0e3..68ec255b39d 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2856,6 +2856,59 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren)
   }
 }
 
+TEST_F(JsonReaderTest, MixedTypesWithSchema)
+{
+  // "data" is a struct in the first row and a list in the second one
+  std::string data = "{\"data\": {\"A\": 0, \"B\": 1}}\n{\"data\": [1,0]}\n";
+
+  // requested schema: "data" is a LIST column whose elements are STRINGs
+  std::map<std::string, cudf::io::schema_element> child_types{
+    {"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}}};
+  std::map<std::string, cudf::io::schema_element> data_types{
+    {"data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}}};
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(data_types)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .normalize_single_quotes(true)
+      .normalize_whitespace(true)
+      .mixed_types_as_string(true)
+      .experimental(true)
+      .keep_quotes(true)
+      .lines(true);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  ASSERT_EQ(result.tbl->num_columns(), 1);  // fatal: guards get_column(0) below
+  EXPECT_EQ(result.tbl->num_rows(), 2);
+  ASSERT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::LIST);  // guards child(1)
+  EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::STRING);
+}
+
+TEST_F(JsonReaderTest, UnicodeFieldname)
+{
+  // field names written with unicode escapes, at nested and leaf levels, match their literal forms
+  std::string data = R"({"data": {"a": 0, "b	c": 1}}
+  {"data": {"\u0061": 2, "\u0062\tc": 3}}
+  {"d\u0061ta": {"a": 4}})";
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .experimental(true)
+      .lines(true);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  ASSERT_EQ(result.tbl->num_columns(), 1);  // fatal: guards get_column(0) below
+  EXPECT_EQ(result.tbl->num_rows(), 3);
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
+  ASSERT_EQ(result.tbl->get_column(0).num_children(), 2);  // fatal: guards child(0)/child(1)
+  EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info.at(0).name, "data");
+  ASSERT_EQ(result.metadata.schema_info.at(0).children.size(), 2);  // checked before indexing
+  EXPECT_EQ(result.metadata.schema_info.at(0).children.at(0).name, "a");
+  EXPECT_EQ(result.metadata.schema_info.at(0).children.at(1).name, "b\tc");
+}
+
 TEST_F(JsonReaderTest, JsonDtypeSchema)
 {
   std::string data = R"(
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 875cc467b6a..15682c6ae6b 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -889,6 +889,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal)
                                                      gpu_tree,
                                                      is_array_of_arrays,
                                                      json_lines,
+                                                     false,
                                                      stream,
                                                      cudf::get_current_device_resource_ref());
 #if LIBCUDF_JSON_DEBUG_DUMP
diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu
new file mode 100644
index 00000000000..f988ae24b38
--- /dev/null
+++ b/cpp/tests/io/json/json_tree_csr.cu
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "io/json/nested_json.hpp"
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/tokenize_json.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+#include <string>
+
+namespace cuio_json = cudf::io::json;
+
+struct h_tree_meta_t {
+  std::vector<cuio_json::NodeT> node_categories;
+  std::vector<cuio_json::NodeIndexT> parent_node_ids;
+  std::vector<cuio_json::SymbolOffsetT> node_range_begin;
+  std::vector<cuio_json::SymbolOffsetT> node_range_end;
+};
+
+struct h_column_tree {
+  // position of nnzs
+  std::vector<cuio_json::NodeIndexT> row_idx;
+  std::vector<cuio_json::NodeIndexT> col_idx;
+  // node properties
+  std::vector<cuio_json::NodeT> categories;
+  std::vector<cuio_json::NodeIndexT> column_ids;
+};
+
+// debug printing
+template <typename T>
+void print(cudf::host_span<T const> vec, std::string name)
+{
+  std::cout << name << " = ";
+  for (auto e : vec) {
+    std::cout << e << " ";
+  }
+  std::cout << std::endl;
+}
+
+bool check_equality(cuio_json::tree_meta_t& d_a,
+                    cudf::device_span<cudf::size_type const> d_a_max_row_offsets,
+                    cuio_json::experimental::compressed_sparse_row& d_b_csr,
+                    cuio_json::experimental::column_tree_properties& d_b_ctp,
+                    rmm::cuda_stream_view stream)
+{
+  // copy both device-side tree representations to host and compare them element-wise
+  stream.synchronize();
+
+  h_tree_meta_t a{cudf::detail::make_std_vector_async(d_a.node_categories, stream),
+                  cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream),
+                  cudf::detail::make_std_vector_async(d_a.node_range_begin, stream),
+                  cudf::detail::make_std_vector_async(d_a.node_range_end, stream)};
+
+  h_column_tree b{cudf::detail::make_std_vector_async(d_b_csr.row_idx, stream),
+                  cudf::detail::make_std_vector_async(d_b_csr.col_idx, stream),
+                  cudf::detail::make_std_vector_async(d_b_ctp.categories, stream),
+                  cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)};
+
+  auto a_max_row_offsets = cudf::detail::make_std_vector_async(d_a_max_row_offsets, stream);
+  auto b_max_row_offsets = cudf::detail::make_std_vector_async(d_b_ctp.max_row_offsets, stream);
+
+  stream.synchronize();
+
+  auto num_nodes = a.parent_node_ids.size();
+  if (num_nodes > 1) {
+    if (b.row_idx.size() != num_nodes + 1) { return false; }
+
+    for (auto pos = b.row_idx[0]; pos < b.row_idx[1]; pos++) {
+      auto v = b.col_idx[pos];
+      if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) { return false; }
+    }
+    for (size_t u = 1; u < num_nodes; u++) {
+      auto v = b.col_idx[b.row_idx[u]];
+      if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) { return false; }
+
+      for (auto pos = b.row_idx[u] + 1; pos < b.row_idx[u + 1]; pos++) {
+        v = b.col_idx[pos];
+        if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) { return false; }
+      }
+    }
+    for (size_t u = 0; u < num_nodes; u++) {
+      if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; }
+    }
+    for (size_t u = 0; u < num_nodes; u++) {
+      if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; }
+    }
+  } else if (num_nodes == 1) {
+    if (b.row_idx.size() != num_nodes + 1) { return false; }
+
+    if (b.row_idx[0] != 0 || b.row_idx[1] != 1) return false;
+    if (!b.col_idx.empty()) return false;
+    for (size_t u = 0; u < num_nodes; u++) {
+      if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; }
+    }
+
+    for (size_t u = 0; u < num_nodes; u++) {
+      if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; }
+    }
+  }
+  return true;
+}
+
+void run_test(std::string const& input, bool enable_lines = true)
+{
+  auto const stream = cudf::get_default_stream();
+  cudf::string_scalar d_scalar(input, true, stream);
+  auto d_input = cudf::device_span<cuio_json::SymbolT const>{d_scalar.data(),
+                                                             static_cast<size_t>(d_scalar.size())};
+
+  cudf::io::json_reader_options options{};
+  options.enable_lines(enable_lines);
+  options.enable_mixed_types_as_string(true);
+
+  // Parse the JSON and get the token stream
+  auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream(
+    d_input, options, stream, cudf::get_current_device_resource_ref());
+
+  // Get the JSON's tree representation
+  auto gpu_tree =
+    cuio_json::detail::get_tree_representation(tokens_gpu,
+                                               token_indices_gpu,
+                                               options.is_enabled_mixed_types_as_string(),
+                                               stream,
+                                               cudf::get_current_device_resource_ref());
+
+  bool const is_array_of_arrays = [&]() {
+    std::array<cuio_json::node_t, 2> h_node_categories = {cuio_json::NC_ERR, cuio_json::NC_ERR};
+    auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size());
+    CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(),
+                                  gpu_tree.node_categories.data(),
+                                  sizeof(cuio_json::node_t) * size_to_copy,
+                                  cudaMemcpyDefault,
+                                  stream.value()));
+    stream.synchronize();
+    if (options.is_enabled_lines()) return h_node_categories[0] == cuio_json::NC_LIST;
+    return h_node_categories[0] == cuio_json::NC_LIST and
+           h_node_categories[1] == cuio_json::NC_LIST;
+  }();
+
+  auto tup =
+    cuio_json::detail::records_orient_tree_traversal(d_input,
+                                                     gpu_tree,
+                                                     is_array_of_arrays,
+                                                     options.is_enabled_lines(),
+                                                     false,
+                                                     stream,
+                                                     cudf::get_current_device_resource_ref());
+  auto& gpu_col_id      = std::get<0>(tup);
+  auto& gpu_row_offsets = std::get<1>(tup);
+
+  auto const num_nodes = gpu_col_id.size();
+  rmm::device_uvector<cudf::size_type> sorted_col_ids(gpu_col_id.size(), stream);  // make a copy
+  thrust::copy(
+    rmm::exec_policy(stream), gpu_col_id.begin(), gpu_col_id.end(), sorted_col_ids.begin());
+
+  // sort by {col_id} on {node_ids} stable
+  rmm::device_uvector<cudf::size_type> node_ids(gpu_col_id.size(), stream);
+  thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end());
+  thrust::stable_sort_by_key(
+    rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin());
+
+  cudf::size_type const row_array_parent_col_id = [&]() {
+    cudf::size_type value      = cuio_json::parent_node_sentinel;
+    auto const list_node_index = options.is_enabled_lines() ? 0 : 1;
+    CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
+                                  gpu_col_id.data() + list_node_index,
+                                  sizeof(cudf::size_type),
+                                  cudaMemcpyDefault,
+                                  stream.value()));
+    stream.synchronize();
+    return value;
+  }();
+
+  auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] =
+    cudf::io::json::detail::reduce_to_column_tree(gpu_tree,
+                                                  gpu_col_id,
+                                                  sorted_col_ids,
+                                                  node_ids,
+                                                  gpu_row_offsets,
+                                                  is_array_of_arrays,
+                                                  row_array_parent_col_id,
+                                                  stream);
+
+  auto [d_column_tree_csr, d_column_tree_properties] =
+    cudf::io::json::experimental::detail::reduce_to_column_tree(gpu_tree,
+                                                                gpu_col_id,
+                                                                sorted_col_ids,
+                                                                node_ids,
+                                                                gpu_row_offsets,
+                                                                is_array_of_arrays,
+                                                                row_array_parent_col_id,
+                                                                stream);
+
+  auto iseq = check_equality(
+    d_column_tree, d_max_row_offsets, d_column_tree_csr, d_column_tree_properties, stream);
+  // assert equality between csr and meta formats
+  ASSERT_TRUE(iseq);
+}
+
+struct JsonColumnTreeTests : public cudf::test::BaseFixture {};
+
+TEST_F(JsonColumnTreeTests, JSONL_Small)
+{
+  std::string const input =
+    R"(  {}
+ { "a": { "y" : 6, "z": [] }}
+ { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )";  // rows with differing nested keys
+  run_test(input);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_Large)
+{
+  std::string const input =
+    R"(  {}
+    {}
+ { "a": { "y" : 6, "z": [] }}
+ { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }}
+ { "a": { "y" : 6, "z": [] }}
+ { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }}
+ { "a": { "y" : 6, "z": [] }}
+ { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }}
+ { "a": { "y" : 6, "z": [] }}
+ { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )";
+  run_test(input);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_ListofStruct)
+{
+  std::string const input = R"(
+  { "Root": { "Key": [ { "EE": "A" } ] } }
+  { "Root": { "Key": {  } } }
+  { "Root": { "Key": [{ "YY": 1}] } }
+  )";
+  run_test(input);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_MissingEntries)
+{
+  std::string json_stringl = R"(
+    {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}
+    {"a": 1, "b": {"0": "abc"          }, "c": false}
+    {"a": 1, "b": {}}
+    {"a": 1,                              "c": null}
+    )";
+  run_test(json_stringl);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_MoreMissingEntries)
+{
+  std::string json_stringl = R"(
+    { "foo1": [1,2,3], "bar": 123 }
+    { "foo2": { "a": 1 }, "bar": 456 }
+    { "foo1": [1,2,3], "bar": 123 }
+    { "foo2": { "a": 1 }, "bar": 456 }
+    { "foo1": [1,2,3], "bar": 123 }
+    { "foo2": { "a": 1 }, "bar": 456 }
+    )";
+  run_test(json_stringl);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_StillMoreMissingEntries)
+{
+  std::string json_stringl = R"(
+    { "foo1": [1,2,3], "bar": 123 }
+    { "foo2": { "a": 1 }, "bar": 456 }
+    { "foo1": ["123","456"], "bar": 123 }
+    { "foo2": { "b": 5 }, "car": 456 }
+    { "foo1": [1,2,3], "bar": 123 }
+    { "foo2": { "a": 1 }, "bar": 456 }
+    )";
+  run_test(json_stringl);
+}
+
+TEST_F(JsonColumnTreeTests, JSON_MissingEntries)
+{
+  std::string json_string = R"([
+    {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true},
+    {"a": 1, "b": {"0": "abc"          }, "c": false},
+    {"a": 1, "b": {}},
+    {"a": 1,                              "c": null}
+    ])";
+  run_test(json_string, false);
+}
+
+TEST_F(JsonColumnTreeTests, JSON_StructOfStructs)
+{
+  std::string json_string =
+    R"([
+    {},
+    { "a": { "y" : 6, "z": [] }},
+    { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }}
+    ])";  // array of rows with differing nested keys
+  run_test(json_string, false);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_ArrayOfArrays_NestedList)
+{
+  std::string json_string =
+    R"([123, [1,2,3]]
+       [456, null,  { "a": 1 }])";
+  run_test(json_string);
+}
+
+TEST_F(JsonColumnTreeTests, JSON_ArrayofArrays_NestedList)
+{
+  std::string json_string = R"([[[1,2,3], null, 123],
+              [null, { "a": 1 }, 456 ]])";
+  run_test(json_string, false);
+}
+
+TEST_F(JsonColumnTreeTests, JSON_CornerCase_Empty)
+{
+  std::string json_string = R"([])";
+  run_test(json_string, false);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_CornerCase_List)
+{
+  std::string json_string = R"([123])";
+  run_test(json_string, true);
+}
+
+TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedList)
+{
+  std::string json_string = R"([[[]]])";
+  run_test(json_string, false);
+}
+
+TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedLists)
+{
+  std::string json_string = R"([[], [], []])";
+  run_test(json_string, false);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_CornerCase_ListofLists)
+{
+  std::string json_string = R"([[1, 2, 3], [4, 5, null], []])";
+  run_test(json_string, true);
+}
+
+TEST_F(JsonColumnTreeTests, JSONL_CornerCase_EmptyListOfLists)
+{
+  std::string json_string = R"([[]])";
+  run_test(json_string, true);
+}
diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu
index 6c147c8a128..9041969bec7 100644
--- a/cpp/tests/join/mixed_join_tests.cu
+++ b/cpp/tests/join/mixed_join_tests.cu
@@ -778,6 +778,138 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality)
              {1});
 }
 
+TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap)
+{
+  auto const col_ref_left_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_one_greater_right_one =
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             left_one_greater_right_one,
+             {2, 7, 8});
+}
+
+TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge)
+{
+  using T1 = double;
+
+  // Number of rows in each column
+  auto constexpr N = 10000;
+
+  // Generate column data for left and right tables
+  auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns<T1>(N, 200);
+  auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns<T1>(N, 100);
+
+  // Setup data and nulls for the left table
+  std::vector<std::pair<std::vector<T1>, std::vector<bool>>> lefts = {
+    {left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}};
+  std::vector<cudf::test::fixed_width_column_wrapper<T1>> left_wrappers;
+  std::vector<cudf::column_view> left_columns;
+  for (auto [data, valids] : lefts) {
+    left_wrappers.emplace_back(
+      cudf::test::fixed_width_column_wrapper<T1>(data.begin(), data.end(), valids.begin()));
+    left_columns.emplace_back(left_wrappers.back());
+  };
+
+  // Setup data and nulls for the right table
+  std::vector<std::pair<std::vector<T1>, std::vector<bool>>> rights = {
+    {right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}};
+  std::vector<cudf::test::fixed_width_column_wrapper<T1>> right_wrappers;
+  std::vector<cudf::column_view> right_columns;
+  for (auto [data, valids] : rights) {
+    right_wrappers.emplace_back(
+      cudf::test::fixed_width_column_wrapper<T1>(data.begin(), data.end(), valids.begin()));
+    right_columns.emplace_back(right_wrappers.back());
+  };
+
+  // Left and right table views.
+  auto const left_table  = cudf::table_view{left_columns};
+  auto const right_table = cudf::table_view{right_columns};
+
+  // Using the zeroth column for equality.
+  auto const left_equality  = left_table.select({0});
+  auto const right_equality = right_table.select({0});
+
+  // Column references for equality column.
+  auto const col_ref_left_0  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_zero_eq_right_zero =
+    cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
+
+  // Mixed semi join with zeroth column equality
+  {
+    // Expected left_semi_join result
+    auto const expected_mixed_semi_join =
+      cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero);
+
+    // Actual mixed_left_semi_join result
+    auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
+                                                            right_equality,
+                                                            left_table,
+                                                            right_table,
+                                                            left_zero_eq_right_zero,
+                                                            cudf::null_equality::UNEQUAL);
+
+    // Copy data back to host for comparisons
+    auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
+      cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
+    auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
+      cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());
+
+    // Sort the indices for 1-1 comparison
+    std::sort(expected_indices.begin(), expected_indices.end());
+    std::sort(result_indices.begin(), result_indices.end());
+
+    // Expected and actual vectors must match.
+    EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
+    EXPECT_TRUE(
+      std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
+  }
+
+  // Mixed semi join with zeroth column equality and first column GREATER conditional
+  {
+    // Column references for conditional column.
+    auto const col_ref_left_1  = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT);
+    auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
+    auto left_one_gt_right_one =
+      cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+    // Expected left_semi_join result
+    auto const expected_mixed_semi_join = cudf::conditional_left_semi_join(
+      left_table,
+      right_table,
+      cudf::ast::operation(
+        cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one));
+
+    // Actual mixed_left_semi_join result
+    auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
+                                                            right_equality,
+                                                            left_table,
+                                                            right_table,
+                                                            left_one_gt_right_one,
+                                                            cudf::null_equality::UNEQUAL);
+
+    // Copy data back to host for comparisons
+    auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
+      cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
+    auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
+      cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());
+
+    // Sort the indices for 1-1 comparison
+    std::sort(expected_indices.begin(), expected_indices.end());
+    std::sort(result_indices.begin(), result_indices.end());
+
+    // Expected and actual vectors must match.
+    EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
+    EXPECT_TRUE(
+      std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
+  }
+}
+
 TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)
 {
   this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}},
@@ -900,3 +1032,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality)
              left_zero_eq_right_zero,
              {0, 1, 3});
 }
+
+TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap)
+{
+  auto const col_ref_left_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_one_greater_right_one =
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             left_one_greater_right_one,
+             {0, 1, 3, 4, 5, 6, 9});
+}
diff --git a/dependencies.yaml b/dependencies.yaml
index 9c95b9f399f..bb8635403a4 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -25,6 +25,7 @@ files:
       - rapids_build_setuptools
       - run_common
       - run_cudf
+      - run_cudf_polars
       - run_pylibcudf
       - run_dask_cudf
       - run_custreamz
@@ -663,7 +664,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.6
+          - polars>=1.8,<1.9
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index d6f8cd2a1ff..e21536e2e97 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -25,6 +25,7 @@ This page provides API documentation for pylibcudf.
     lists
     merge
     null_mask
+    partitioning
     quantiles
     reduce
     replace
@@ -38,6 +39,7 @@ This page provides API documentation for pylibcudf.
     table
     traits
     transform
+    transpose
     types
     unary
 
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index c8933981736..53638f071cc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,3 +19,4 @@ I/O Functions
     csv
     json
     parquet
+    timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst
new file mode 100644
index 00000000000..20c1ffc2e93
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst
@@ -0,0 +1,6 @@
+========
+Timezone
+========
+
+.. automodule:: pylibcudf.io.timezone
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst
new file mode 100644
index 00000000000..6951dbecca0
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst
@@ -0,0 +1,6 @@
+============
+partitioning
+============
+
+.. automodule:: pylibcudf.partitioning
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst
new file mode 100644
index 00000000000..6241295e770
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst
@@ -0,0 +1,6 @@
+=========
+transpose
+=========
+
+.. automodule:: pylibcudf.transpose
+   :members:
diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst
index 142124163af..0e701b849fd 100644
--- a/docs/dask_cudf/source/best_practices.rst
+++ b/docs/dask_cudf/source/best_practices.rst
@@ -81,7 +81,7 @@ representations, native cuDF spilling may be insufficient. For these cases,
 `JIT-unspill <https://docs.rapids.ai/api/dask-cuda/nightly/spilling/#jit-unspill>`__
 is likely to produce better protection from out-of-memory (OOM) errors.
 Please see `Dask-CUDA's spilling documentation
-<https://docs.rapids.ai/api/dask-cuda/24.10/spilling/>`__ for further details
+<https://docs.rapids.ai/api/dask-cuda/stable/spilling/>`__ for further details
 and guidance.
 
 Use RMM
@@ -252,6 +252,15 @@ result in a simple 1-to-1 mapping between files and output partitions.
   correspond to a reasonable partition size, use ``blocksize=None``
   to avoid unnecessary metadata collection.
 
+.. note::
+  When reading from remote storage (e.g. S3 and GCS), performance will
+  likely improve with ``filesystem="arrow"``. When this option is set,
+  PyArrow will be used to perform IO on multiple CPU threads. Please be
+  aware that this feature is experimental, and behavior may change in
+  the future (without deprecation). Do not pass in ``blocksize`` or
+  ``aggregate_files`` when this feature is used. Instead, set the
+  ``"dataframe.parquet.minimum-partition-size"`` config to control
+  file aggregation.
 
 Use :func:`from_map`
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 23ca7e49753..6eb755d7854 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -40,9 +40,10 @@ Using Dask cuDF
 The Dask DataFrame API (Recommended)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Simply use the `Dask configuration <dask:configuration>`__ system to
-set the ``"dataframe.backend"`` option to ``"cudf"``. From Python,
-this can be achieved like so::
+Simply use the `Dask configuration
+<https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html>`__
+system to set the ``"dataframe.backend"`` option to ``"cudf"``.
+From Python, this can be achieved like so::
 
   import dask
 
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 2bb74c3e3b1..e41cc15712f 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -39,6 +39,7 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean allowNonNumericNumbers;
   private final boolean allowUnquotedControlChars;
   private final boolean cudfPruneSchema;
+  private final boolean experimental;
   private final byte lineDelimiter;
 
   private JSONOptions(Builder builder) {
@@ -55,6 +56,7 @@ private JSONOptions(Builder builder) {
     allowNonNumericNumbers = builder.allowNonNumericNumbers;
     allowUnquotedControlChars = builder.allowUnquotedControlChars;
     cudfPruneSchema = builder.cudfPruneSchema;
+    experimental = builder.experimental;
     lineDelimiter = builder.lineDelimiter;
   }
 
@@ -111,6 +113,10 @@ public boolean unquotedControlChars() {
     return allowUnquotedControlChars;
   }
 
+  public boolean experimental() {
+    return experimental;
+  }
+
   @Override
   String[] getIncludeColumnNames() {
     throw new UnsupportedOperationException("JSON reader didn't support column prune");
@@ -136,6 +142,7 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
     private boolean keepQuotes = false;
 
     private boolean cudfPruneSchema = false;
+    private boolean experimental = false;
     private byte lineDelimiter = '\n';
 
     public Builder withCudfPruneSchema(boolean prune) {
@@ -159,6 +166,14 @@ public Builder withStrictValidation(boolean isAllowed) {
       return this;
     }
 
+    /**
+     * Should experimental features be enabled or not
+     */
+    public Builder withExperimental(boolean isAllowed) {
+      experimental = isAllowed;
+      return this;
+    }
+
     /**
      * Should leading zeros on numbers be allowed or not. Strict validation
      * must be enabled for this to have any effect.
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 6d370ca27b2..dbee53640aa 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -260,6 +260,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                         boolean allowNonNumericNumbers,
                                         boolean allowUnquotedControl,
                                         boolean pruneColumns,
+                                        boolean experimental,
                                         byte lineDelimiter) throws CudfException;
 
   private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
@@ -275,6 +276,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
                                       boolean allowNonNumericNumbers,
                                       boolean allowUnquotedControl,
                                       boolean pruneColumns,
+                                      boolean experimental,
                                       byte lineDelimiter,
                                       long dsHandle) throws CudfException;
 
@@ -288,6 +290,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool
                                       boolean allowLeadingZeros,
                                       boolean allowNonNumericNumbers,
                                       boolean allowUnquotedControl,
+                                      boolean experimental,
                                       byte lineDelimiter,
                                       long dsHandle) throws CudfException;
 
@@ -303,6 +306,7 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean allowLeadingZeros,
                                               boolean allowNonNumericNumbers,
                                               boolean allowUnquotedControl,
+                                              boolean experimental,
                                               byte lineDelimiter) throws CudfException;
 
   /**
@@ -1333,6 +1337,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
                     opts.nonNumericNumbersAllowed(),
                     opts.unquotedControlChars(),
                     cudfPruneSchema,
+                    opts.experimental(),
                     opts.getLineDelimiter()))) {
 
       return gatherJSONColumns(schema, twm, -1);
@@ -1417,6 +1422,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
         opts.leadingZerosAllowed(),
         opts.nonNumericNumbersAllowed(),
         opts.unquotedControlChars(),
+        opts.experimental(),
         opts.getLineDelimiter()));
   }
 
@@ -1439,6 +1445,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
           opts.leadingZerosAllowed(),
           opts.nonNumericNumbersAllowed(),
           opts.unquotedControlChars(),
+          opts.experimental(),
           opts.getLineDelimiter(),
           dsHandle));
         return twm;
@@ -1499,6 +1506,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
             opts.nonNumericNumbersAllowed(),
             opts.unquotedControlChars(),
             cudfPruneSchema,
+            opts.experimental(),
             opts.getLineDelimiter()))) {
       return gatherJSONColumns(schema, twm, emptyRowCount);
     }
@@ -1543,6 +1551,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
         opts.nonNumericNumbersAllowed(),
         opts.unquotedControlChars(),
         cudfPruneSchema,
+        opts.experimental(),
         opts.getLineDelimiter(),
         dsHandle))) {
       return gatherJSONColumns(schema, twm, emptyRowCount);
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 0f77da54152..0a667978ca3 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
                                                          jboolean allow_leading_zeros,
                                                          jboolean allow_nonnumeric_numbers,
                                                          jboolean allow_unquoted_control,
+                                                         jboolean experimental,
                                                          jbyte line_delimiter,
                                                          jlong ds_handle)
 {
@@ -1649,6 +1650,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .mixed_types_as_string(mixed_types_as_string)
         .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
+        .experimental(experimental)
         .keep_quotes(keep_quotes)
         .prune_columns(false);
     if (strict_validation) {
@@ -1680,6 +1682,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
                                            jboolean allow_leading_zeros,
                                            jboolean allow_nonnumeric_numbers,
                                            jboolean allow_unquoted_control,
+                                           jboolean experimental,
                                            jbyte line_delimiter)
 {
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
@@ -1705,6 +1708,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .strict_validation(strict_validation)
         .mixed_types_as_string(mixed_types_as_string)
         .prune_columns(false)
+        .experimental(experimental)
         .delimiter(static_cast<char>(line_delimiter))
         .keep_quotes(keep_quotes);
     if (strict_validation) {
@@ -1821,6 +1825,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
                                                  jboolean allow_nonnumeric_numbers,
                                                  jboolean allow_unquoted_control,
                                                  jboolean prune_columns,
+                                                 jboolean experimental,
                                                  jbyte line_delimiter,
                                                  jlong ds_handle)
 {
@@ -1859,7 +1864,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
         .keep_quotes(keep_quotes)
-        .prune_columns(prune_columns);
+        .prune_columns(prune_columns)
+        .experimental(experimental);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
@@ -1920,6 +1926,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
                                                            jboolean allow_nonnumeric_numbers,
                                                            jboolean allow_unquoted_control,
                                                            jboolean prune_columns,
+                                                           jboolean experimental,
                                                            jbyte line_delimiter)
 {
   bool read_buffer = true;
@@ -1972,7 +1979,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
         .keep_quotes(keep_quotes)
-        .prune_columns(prune_columns);
+        .prune_columns(prune_columns)
+        .experimental(experimental);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 7c91533cf93..3c96b90f0a1 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -78,8 +78,11 @@ class Aggregation:
         )
 
     @classmethod
-    def nunique(cls):
-        return cls(pylibcudf.aggregation.nunique(pylibcudf.types.NullPolicy.EXCLUDE))
+    def nunique(cls, dropna=True):
+        return cls(pylibcudf.aggregation.nunique(
+            pylibcudf.types.NullPolicy.EXCLUDE
+            if dropna else pylibcudf.types.NullPolicy.INCLUDE
+        ))
 
     @classmethod
     def nth(cls, size):
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 48f75b12a73..9b7ab0888d2 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -3,11 +3,8 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
 from libcpp.utility cimport move
-from libcpp.vector cimport vector
 
-cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.hash cimport (
     md5,
@@ -19,37 +16,23 @@ from pylibcudf.libcudf.hash cimport (
     sha512,
     xxhash_64,
 )
-from pylibcudf.libcudf.partitioning cimport (
-    hash_partition as cpp_hash_partition,
-)
-from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.table.table_view cimport table_view
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
+from cudf._lib.utils cimport table_view_from_columns
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
-def hash_partition(list source_columns, object columns_to_hash,
+def hash_partition(list source_columns, list columns_to_hash,
                    int num_partitions):
-    cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
-    cdef int c_num_partitions = num_partitions
-    cdef table_view c_source_view = table_view_from_columns(source_columns)
-
-    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
-    with nogil:
-        c_result = move(
-            cpp_hash_partition(
-                c_source_view,
-                c_columns_to_hash,
-                c_num_partitions
-            )
-        )
-
-    return (
-        columns_from_unique_ptr(move(c_result.first)),
-        list(c_result.second)
+    plc_table, offsets = plc.partitioning.hash_partition(
+        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
+        columns_to_hash,
+        num_partitions
     )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd
index 1938f00c179..76a6e32fde0 100644
--- a/python/cudf/cudf/_lib/io/utils.pxd
+++ b/python/cudf/cudf/_lib/io/utils.pxd
@@ -21,6 +21,10 @@ cdef add_df_col_struct_names(
     df,
     child_names_dict
 )
+cdef update_col_struct_field_names(
+    Column col,
+    child_names
+)
 cdef update_struct_field_names(
     table,
     vector[column_name_info]& schema_info)
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index adeba6fffb1..f88c48ce989 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -1,8 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-import cudf
-from cudf.core.buffer import acquire_spill_lock
-
 from libc.stdint cimport int64_t
 from libcpp cimport bool, int
 from libcpp.map cimport map
@@ -11,187 +8,43 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-import datetime
 from collections import OrderedDict
 
-cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
-
 try:
     import ujson as json
 except ImportError:
     import json
 
 cimport pylibcudf.libcudf.io.types as cudf_io_types
+cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
 from pylibcudf.libcudf.io.data_sink cimport data_sink
 from pylibcudf.libcudf.io.orc cimport (
     chunked_orc_writer_options,
     orc_chunked_writer,
-    orc_reader_options,
     orc_writer_options,
-    read_orc as libcudf_read_orc,
     write_orc as libcudf_write_orc,
 )
-from pylibcudf.libcudf.io.orc_metadata cimport (
-    binary_statistics,
-    bucket_statistics,
-    column_statistics,
-    date_statistics,
-    decimal_statistics,
-    double_statistics,
-    integer_statistics,
-    no_statistics,
-    parsed_orc_statistics,
-    read_parsed_orc_statistics as libcudf_read_parsed_orc_statistics,
-    statistics_type,
-    string_statistics,
-    timestamp_statistics,
-)
 from pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
     compression_type,
     sink_info,
-    source_info,
     table_input_metadata,
-    table_with_metadata,
 )
 from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport data_type, size_type, type_id
-from pylibcudf.variant cimport get_if as std_get_if, holds_alternative
 
 from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport (
-    make_sink_info,
-    make_source_info,
-    update_column_struct_field_names,
-)
+from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names
+from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
 
-from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
-
-from cudf._lib.types cimport underlying_type_t_type_id
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+import pylibcudf as plc
 
+import cudf
+from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
+from cudf.core.buffer import acquire_spill_lock
 
 
-cdef _parse_column_type_statistics(column_statistics stats):
-    # Initialize stats to return and parse stats blob
-    column_stats = {}
-
-    if stats.number_of_values.has_value():
-        column_stats["number_of_values"] = stats.number_of_values.value()
-
-    if stats.has_null.has_value():
-        column_stats["has_null"] = stats.has_null.value()
-
-    cdef statistics_type type_specific_stats = stats.type_specific_stats
-
-    cdef integer_statistics* int_stats
-    cdef double_statistics* dbl_stats
-    cdef string_statistics* str_stats
-    cdef bucket_statistics* bucket_stats
-    cdef decimal_statistics* dec_stats
-    cdef date_statistics* date_stats
-    cdef binary_statistics* bin_stats
-    cdef timestamp_statistics* ts_stats
-
-    if holds_alternative[no_statistics](type_specific_stats):
-        return column_stats
-    elif int_stats := std_get_if[integer_statistics](&type_specific_stats):
-        if int_stats.minimum.has_value():
-            column_stats["minimum"] = int_stats.minimum.value()
-        else:
-            column_stats["minimum"] = None
-        if int_stats.maximum.has_value():
-            column_stats["maximum"] = int_stats.maximum.value()
-        else:
-            column_stats["maximum"] = None
-        if int_stats.sum.has_value():
-            column_stats["sum"] = int_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif dbl_stats := std_get_if[double_statistics](&type_specific_stats):
-        if dbl_stats.minimum.has_value():
-            column_stats["minimum"] = dbl_stats.minimum.value()
-        else:
-            column_stats["minimum"] = None
-        if dbl_stats.maximum.has_value():
-            column_stats["maximum"] = dbl_stats.maximum.value()
-        else:
-            column_stats["maximum"] = None
-        if dbl_stats.sum.has_value():
-            column_stats["sum"] = dbl_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif str_stats := std_get_if[string_statistics](&type_specific_stats):
-        if str_stats.minimum.has_value():
-            column_stats["minimum"] = str_stats.minimum.value().decode("utf-8")
-        else:
-            column_stats["minimum"] = None
-        if str_stats.maximum.has_value():
-            column_stats["maximum"] = str_stats.maximum.value().decode("utf-8")
-        else:
-            column_stats["maximum"] = None
-        if str_stats.sum.has_value():
-            column_stats["sum"] = str_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif bucket_stats := std_get_if[bucket_statistics](&type_specific_stats):
-        column_stats["true_count"] = bucket_stats.count[0]
-        column_stats["false_count"] = (
-            column_stats["number_of_values"]
-            - column_stats["true_count"]
-        )
-    elif dec_stats := std_get_if[decimal_statistics](&type_specific_stats):
-        if dec_stats.minimum.has_value():
-            column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8")
-        else:
-            column_stats["minimum"] = None
-        if dec_stats.maximum.has_value():
-            column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8")
-        else:
-            column_stats["maximum"] = None
-        if dec_stats.sum.has_value():
-            column_stats["sum"] = dec_stats.sum.value().decode("utf-8")
-        else:
-            column_stats["sum"] = None
-    elif date_stats := std_get_if[date_statistics](&type_specific_stats):
-        if date_stats.minimum.has_value():
-            column_stats["minimum"] = datetime.datetime.fromtimestamp(
-                datetime.timedelta(date_stats.minimum.value()).total_seconds(),
-                datetime.timezone.utc,
-            )
-        else:
-            column_stats["minimum"] = None
-        if date_stats.maximum.has_value():
-            column_stats["maximum"] = datetime.datetime.fromtimestamp(
-                datetime.timedelta(date_stats.maximum.value()).total_seconds(),
-                datetime.timezone.utc,
-            )
-        else:
-            column_stats["maximum"] = None
-    elif bin_stats := std_get_if[binary_statistics](&type_specific_stats):
-        if bin_stats.sum.has_value():
-            column_stats["sum"] = bin_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif ts_stats := std_get_if[timestamp_statistics](&type_specific_stats):
-        # Before ORC-135, the local timezone offset was included and they were
-        # stored as minimum and maximum. After ORC-135, the timestamp is
-        # adjusted to UTC before being converted to milliseconds and stored
-        # in minimumUtc and maximumUtc.
-        # TODO: Support minimum and maximum by reading writer's local timezone
-        if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value():
-            column_stats["minimum"] = datetime.datetime.fromtimestamp(
-                ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc
-            )
-            column_stats["maximum"] = datetime.datetime.fromtimestamp(
-                ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc
-            )
-    else:
-        raise ValueError("Unsupported statistics type")
-    return column_stats
-
-
+# TODO: Consider inlining this function since it seems to only be used in one place.
 cpdef read_parsed_orc_statistics(filepath_or_buffer):
     """
     Cython function to call into libcudf API, see `read_parsed_orc_statistics`.
@@ -201,25 +54,13 @@ cpdef read_parsed_orc_statistics(filepath_or_buffer):
     cudf.io.orc.read_orc_statistics
     """
 
-    cdef parsed_orc_statistics parsed = (
-        libcudf_read_parsed_orc_statistics(make_source_info([filepath_or_buffer]))
+    parsed = (
+        plc.io.orc.read_parsed_orc_statistics(
+            plc.io.SourceInfo([filepath_or_buffer])
+        )
     )
 
-    cdef vector[column_statistics] file_stats = parsed.file_stats
-    cdef vector[vector[column_statistics]] stripes_stats = parsed.stripes_stats
-
-    parsed_file_stats = [
-        _parse_column_type_statistics(file_stats[column_index])
-        for column_index in range(file_stats.size())
-    ]
-
-    parsed_stripes_stats = [
-        [_parse_column_type_statistics(stripes_stats[stripe_index][column_index])
-         for column_index in range(stripes_stats[stripe_index].size())]
-        for stripe_index in range(stripes_stats.size())
-    ]
-
-    return parsed.column_names, parsed_file_stats, parsed_stripes_stats
+    return parsed.column_names, parsed.file_stats, parsed.stripes_stats
 
 
 cpdef read_orc(object filepaths_or_buffers,
@@ -235,36 +76,34 @@ cpdef read_orc(object filepaths_or_buffers,
     See Also
     --------
     cudf.read_orc
+
+    Notes
+    -----
+    Currently this function only considers the metadata of the first file in the list of
+    filepaths_or_buffers.
     """
-    cdef orc_reader_options c_orc_reader_options = make_orc_reader_options(
-        filepaths_or_buffers,
+
+    if columns is not None:
+        columns = [str(col) for col in columns]
+
+    tbl_w_meta = plc.io.orc.read_orc(
+        plc.io.SourceInfo(filepaths_or_buffers),
         columns,
-        stripes or [],
+        stripes,
         get_skiprows_arg(skip_rows),
         get_num_rows_arg(num_rows),
-        (
-            type_id.EMPTY
-            if timestamp_type is None else
-            <type_id>(
-                <underlying_type_t_type_id> (
-                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
-                        cudf.dtype(timestamp_type)
-                    ]
-                )
-            )
-        ),
         use_index,
+        plc.types.DataType(
+            SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[
+                cudf.dtype(timestamp_type)
+            ]
+        )
     )
 
-    cdef table_with_metadata c_result
-    cdef size_type nrows
+    names = tbl_w_meta.column_names(include_children=False)
 
-    with nogil:
-        c_result = move(libcudf_read_orc(c_orc_reader_options))
-
-    names = [info.name.decode() for info in c_result.metadata.schema_info]
     actual_index_names, col_names, is_range_index, reset_index_name, \
-        range_idx = _get_index_from_metadata(c_result.metadata.user_data,
+        range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data,
                                              names,
                                              skip_rows,
                                              num_rows)
@@ -272,11 +111,11 @@ cpdef read_orc(object filepaths_or_buffers,
     if columns is not None and (isinstance(columns, list) and len(columns) == 0):
         # When `columns=[]`, index needs to be
         # established, but not the columns.
-        nrows = c_result.tbl.get()[0].view().num_rows()
+        nrows = tbl_w_meta.tbl.num_rows()
         return {}, cudf.RangeIndex(nrows)
 
-    data, index = data_from_unique_ptr(
-        move(c_result.tbl),
+    data, index = data_from_pylibcudf_io(
+        tbl_w_meta,
         col_names if columns is None else names,
         actual_index_names
     )
@@ -286,11 +125,13 @@ cpdef read_orc(object filepaths_or_buffers,
     elif reset_index_name:
         index.names = [None] * len(index.names)
 
+    child_name_values = tbl_w_meta.child_names.values()
+
     data = {
-        name: update_column_struct_field_names(
-            col, c_result.metadata.schema_info[i]
+        name: update_col_struct_field_names(
+            col, child_names
         )
-        for i, (name, col) in enumerate(data.items())
+        for (name, col), child_names in zip(data.items(), child_name_values)
     }
 
     return data, index
@@ -313,32 +154,35 @@ cdef compression_type _get_comp_type(object compression):
         raise ValueError(f"Unsupported `compression` type {compression}")
 
 cdef tuple _get_index_from_metadata(
-        map[string, string] user_data,
+        vector[map[string, string]] user_data,
         object names,
         object skip_rows,
         object num_rows):
-    json_str = user_data[b'pandas'].decode('utf-8')
+
     meta = None
     index_col = None
     is_range_index = False
     reset_index_name = False
     range_idx = None
-    if json_str != "":
-        meta = json.loads(json_str)
-        if 'index_columns' in meta and len(meta['index_columns']) > 0:
-            index_col = meta['index_columns']
-            if isinstance(index_col[0], dict) and \
-                    index_col[0]['kind'] == 'range':
-                is_range_index = True
-            else:
-                index_col_names = OrderedDict()
-                for idx_col in index_col:
-                    for c in meta['columns']:
-                        if c['field_name'] == idx_col:
-                            index_col_names[idx_col] = \
-                                c['name'] or c['field_name']
-                            if c['name'] is None:
-                                reset_index_name = True
+
+    if user_data.size() > 0:
+        json_str = user_data[0][b'pandas'].decode('utf-8')
+        if json_str != "":
+            meta = json.loads(json_str)
+            if 'index_columns' in meta and len(meta['index_columns']) > 0:
+                index_col = meta['index_columns']
+                if isinstance(index_col[0], dict) and \
+                        index_col[0]['kind'] == 'range':
+                    is_range_index = True
+                else:
+                    index_col_names = OrderedDict()
+                    for idx_col in index_col:
+                        for c in meta['columns']:
+                            if c['field_name'] == idx_col:
+                                index_col_names[idx_col] = \
+                                    c['name'] or c['field_name']
+                                if c['name'] is None:
+                                    reset_index_name = True
 
     actual_index_names = None
     if index_col is not None and len(index_col) > 0:
@@ -473,41 +317,6 @@ cdef int64_t get_num_rows_arg(object arg) except*:
     return <int64_t> arg
 
 
-cdef orc_reader_options make_orc_reader_options(
-    object filepaths_or_buffers,
-    object column_names,
-    object stripes,
-    int64_t skip_rows,
-    int64_t num_rows,
-    type_id timestamp_type,
-    bool use_index
-) except*:
-
-    cdef vector[vector[size_type]] strps = stripes
-    cdef orc_reader_options opts
-    cdef source_info src = make_source_info(filepaths_or_buffers)
-    opts = move(
-        orc_reader_options.builder(src)
-        .stripes(strps)
-        .skip_rows(skip_rows)
-        .timestamp_type(data_type(timestamp_type))
-        .use_index(use_index)
-        .build()
-    )
-    if num_rows >= 0:
-        opts.set_num_rows(num_rows)
-
-    cdef vector[string] c_column_names
-    if column_names is not None:
-        c_column_names.reserve(len(column_names))
-        for col in column_names:
-            c_column_names.push_back(str(col).encode())
-        if len(column_names) > 0:
-            opts.set_columns(c_column_names)
-
-    return opts
-
-
 cdef class ORCWriter:
     """
     ORCWriter lets you you incrementally write out a ORC file from a series
diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
index d94f0e1b564..13997da8403 100644
--- a/python/cudf/cudf/_lib/partitioning.pyx
+++ b/python/cudf/cudf/_lib/partitioning.pyx
@@ -2,24 +2,13 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.partitioning cimport partition as cpp_partition
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
+
+import pylibcudf as plc
 
 from cudf._lib.reduce import minmax
 from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
 
-cimport pylibcudf.libcudf.types as libcudf_types
-
 
 @acquire_spill_lock()
 def partition(list source_columns, Column partition_map,
@@ -50,25 +39,15 @@ def partition(list source_columns, Column partition_map,
 
     if num_partitions is None:
         num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True)
-    cdef int c_num_partitions = num_partitions
-    cdef table_view c_source_view = table_view_from_columns(source_columns)
-
-    cdef column_view c_partition_map_view = partition_map.view()
 
-    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
     if partition_map.size > 0:
         lo, hi = minmax(partition_map)
         if lo < 0 or hi >= num_partitions:
             raise ValueError("Partition map has invalid values")
-    with nogil:
-        c_result = move(
-            cpp_partition(
-                c_source_view,
-                c_partition_map_view,
-                c_num_partitions
-            )
-        )
 
-    return (
-        columns_from_unique_ptr(move(c_result.first)), list(c_result.second)
+    plc_table, offsets = plc.partitioning.partition(
+        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
+        partition_map.to_pylibcudf(mode="read"),
+        num_partitions
     )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets
diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx
index fe8c17c9e31..df81b3942b4 100644
--- a/python/cudf/cudf/_lib/strings/attributes.pyx
+++ b/python/cudf/cudf/_lib/strings/attributes.pyx
@@ -2,19 +2,10 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.attributes cimport (
-    code_points as cpp_code_points,
-    count_bytes as cpp_count_bytes,
-    count_characters as cpp_count_characters,
-)
-
 from cudf._lib.column cimport Column
 
+import pylibcudf as plc
+
 
 @acquire_spill_lock()
 def count_characters(Column source_strings):
@@ -22,13 +13,10 @@ def count_characters(Column source_strings):
     Returns an integer numeric column containing the
     length of each string in characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_count_characters(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.attributes.count_characters(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -37,13 +25,10 @@ def count_bytes(Column source_strings):
     Returns an integer numeric column containing the
     number of bytes of each string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_count_bytes(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.attributes.count_bytes(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -52,10 +37,7 @@ def code_points(Column source_strings):
     Creates a numeric column with code point values (integers)
     for each character of each string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_code_points(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.attributes.code_points(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
index bff3b2c4ce4..54624a5a2fd 100644
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -1,29 +1,10 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.optional cimport make_optional
-from libcpp.string cimport string
-from libcpp.utility cimport move
+import pylibcudf as plc
 
-from pylibcudf.libcudf.io.timezone cimport (
-    make_timezone_transition_table as cpp_make_timezone_transition_table,
-)
-from pylibcudf.libcudf.table.table cimport table
-
-from cudf._lib.utils cimport columns_from_unique_ptr
+from cudf._lib.column cimport Column
 
 
 def make_timezone_transition_table(tzdir, tzname):
-    cdef unique_ptr[table] c_result
-    cdef string c_tzdir = tzdir.encode()
-    cdef string c_tzname = tzname.encode()
-
-    with nogil:
-        c_result = move(
-            cpp_make_timezone_transition_table(
-                make_optional[string](c_tzdir),
-                c_tzname
-            )
-        )
-
-    return columns_from_unique_ptr(move(c_result))
+    plc_table = plc.io.timezone.make_timezone_transition_table(tzdir, tzname)
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
index f78fbd4c844..995d278cb88 100644
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ b/python/cudf/cudf/_lib/transpose.pyx
@@ -1,32 +1,18 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.transpose cimport transpose as cpp_transpose
+import pylibcudf as plc
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns
 
 
 def transpose(list source_columns):
     """Transpose m n-row columns into n m-row columns
     """
-    cdef pair[unique_ptr[column], table_view] c_result
-    cdef table_view c_input = table_view_from_columns(source_columns)
-
-    with nogil:
-        c_result = move(cpp_transpose(c_input))
-
-    # Notice, the data pointer of `result_owner` has been exposed
-    # through `c_result.second` at this point.
-    result_owner = Column.from_unique_ptr(
-        move(c_result.first), data_ptr_exposed=True
-    )
-    return columns_from_table_view(
-        c_result.second,
-        owners=[result_owner] * c_result.second.num_columns()
+    input_table = plc.table.Table(
+        [col.to_pylibcudf(mode="read") for col in source_columns]
     )
+    result_table = plc.transpose.transpose(input_table)
+    return [
+        Column.from_pylibcudf(col, data_ptr_exposed=True)
+        for col in result_table.columns()
+    ]
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index ff97fe80310..7254db5c43d 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
 cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
-cdef data_from_pylibcudf_io(tbl_with_meta)
+cdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
 cdef table_view table_view_from_columns(columns) except *
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 8660cca9322..9e5b99f64eb 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -316,15 +316,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
         index_names
     )
 
-cdef data_from_pylibcudf_io(tbl_with_meta):
+cdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None):
     """
     Unpacks the TableWithMetadata from libcudf I/O
     into a dict of columns and an Index (cuDF format)
     """
+    if column_names is None:
+        column_names = tbl_with_meta.column_names(include_children=False)
     return _data_from_columns(
         columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns],
-        column_names=tbl_with_meta.column_names(include_children=False),
-        index_names=None
+        column_names=column_names,
+        index_names=index_names
     )
 
 cdef columns_from_table_view(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 98af006f6e5..37ad6b8fabb 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -767,11 +767,15 @@ def fillna(
         )
 
     @_performance_tracking
-    def _drop_column(self, name):
-        """Drop a column by *name*"""
-        if name not in self._data:
-            raise KeyError(f"column '{name}' does not exist")
-        del self._data[name]
+    def _drop_column(
+        self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise"
+    ) -> None:
+        """Drop column *name* in place; *errors* governs a missing name."""
+        try:
+            del self._data[name]
+        except KeyError as err:
+            if errors != "ignore":
+                raise KeyError(f"column '{name}' does not exist") from err
 
     @_performance_tracking
     def _quantile_table(
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index cb8cd0cd28b..be05075a2cd 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2232,6 +2232,22 @@ def func(x):
 
         return self.agg(func)
 
+    @_performance_tracking
+    def nunique(self, dropna: bool = True):
+        """
+        Count the distinct values found in each group.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            When True, NaN is left out of the counts.
+        """
+
+        def func(x):
+            return x.nunique(dropna=dropna)
+
+        return self.agg(func)
+
     @_performance_tracking
     def std(
         self,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 810d4ad74e7..5952815deef 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3,7 +3,6 @@
 
 from __future__ import annotations
 
-import numbers
 import operator
 import textwrap
 import warnings
@@ -150,24 +149,14 @@
 )
 
 
-def _get_host_unique(array):
+def _get_unique_drop_labels(array):
+    """Yield the unique labels to be dropped for IndexedFrame.drop."""
     if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)):
-        return array.unique.to_pandas()
-    elif isinstance(array, (str, numbers.Number)):
-        return [array]
+        yield from np.unique(as_column(array).values_host)
+    elif is_scalar(array):
+        yield array
     else:
-        return set(array)
-
-
-def _drop_columns(f: Frame, columns: abc.Iterable, errors: str):
-    for c in columns:
-        try:
-            f._drop_column(c)
-        except KeyError as e:
-            if errors == "ignore":
-                pass
-            else:
-                raise e
+        yield from set(array)
 
 
 def _indices_from_labels(obj, labels):
@@ -5262,15 +5251,14 @@ def drop(
             out = self.copy()
 
         if axis in (1, "columns"):
-            target = _get_host_unique(target)
-
-            _drop_columns(out, target, errors)
+            for label in _get_unique_drop_labels(target):
+                out._drop_column(label, errors=errors)
         elif axis in (0, "index"):
             dropped = _drop_rows_by_labels(out, target, level, errors)
 
             if columns is not None:
-                columns = _get_host_unique(columns)
-                _drop_columns(dropped, columns, errors)
+                for label in _get_unique_drop_labels(columns):
+                    dropped._drop_column(label, errors=errors)
 
             out._mimic_inplace(dropped, inplace=True)
 
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index c54293badbe..68b60809bb9 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -181,11 +181,6 @@ def read_orc_statistics(
             parsed_stripes_statistics,
         ) = liborc.read_parsed_orc_statistics(path_or_buf)
 
-        # Parse column names
-        column_names = [
-            column_name.decode("utf-8") for column_name in column_names
-        ]
-
         # Parse file statistics
         file_statistics = {
             column_name: column_stats
@@ -248,9 +243,9 @@ def _filter_stripes(
         num_rows_scanned = 0
         for i, stripe_statistics in enumerate(stripes_statistics):
             num_rows_before_stripe = num_rows_scanned
-            num_rows_scanned += next(iter(stripe_statistics.values()))[
-                "number_of_values"
-            ]
+            num_rows_scanned += next(
+                iter(stripe_statistics.values())
+            ).number_of_values
             if stripes is not None and i not in stripes:
                 continue
             if skip_rows is not None and num_rows_scanned <= skip_rows:
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 773141ee71a..979c936a182 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -33,9 +33,10 @@ def __array_function__(self, *args, **kwargs):
 
 missing_arrfunc_reason = "NEP-18 support is not available in NumPy"
 
+np.random.seed(0)
+
 
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
-@pytest.mark.parametrize("np_ar", [np.random.random(100)])
 @pytest.mark.parametrize(
     "func",
     [
@@ -47,7 +48,8 @@ def __array_function__(self, *args, **kwargs):
         lambda x: np.linalg.norm(x),
     ],
 )
-def test_array_func_cudf_series(np_ar, func):
+def test_array_func_cudf_series(func):
+    np_ar = np.random.random(100)
     cudf_ser = cudf.Series(np_ar)
     expect = func(np_ar)
     got = func(cudf_ser)
@@ -58,9 +60,6 @@ def test_array_func_cudf_series(np_ar, func):
 
 
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
-@pytest.mark.parametrize(
-    "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]
-)
 @pytest.mark.parametrize(
     "func",
     [
@@ -74,7 +73,8 @@ def test_array_func_cudf_series(np_ar, func):
         lambda x: np.prod(x, axis=1),
     ],
 )
-def test_array_func_cudf_dataframe(pd_df, func):
+def test_array_func_cudf_dataframe(func):
+    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     expect = func(pd_df)
     got = func(cudf_df)
@@ -82,9 +82,6 @@ def test_array_func_cudf_dataframe(pd_df, func):
 
 
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
-@pytest.mark.parametrize(
-    "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]
-)
 @pytest.mark.parametrize(
     "func",
     [
@@ -93,21 +90,22 @@ def test_array_func_cudf_dataframe(pd_df, func):
         lambda x: np.linalg.det(x),
     ],
 )
-def test_array_func_missing_cudf_dataframe(pd_df, func):
+def test_array_func_missing_cudf_dataframe(func):
+    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     with pytest.raises(TypeError):
         func(cudf_df)
 
 
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
-@pytest.mark.parametrize("np_ar", [np.random.random(100)])
 @pytest.mark.parametrize(
     "func",
     [
         lambda x: np.unique(x),
     ],
 )
-def test_array_func_cudf_index(np_ar, func):
+def test_array_func_cudf_index(func):
+    np_ar = np.random.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     expect = func(np_ar)
     got = func(cudf_index)
@@ -118,7 +116,6 @@ def test_array_func_cudf_index(np_ar, func):
 
 
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
-@pytest.mark.parametrize("np_ar", [np.random.random(100)])
 @pytest.mark.parametrize(
     "func",
     [
@@ -127,7 +124,8 @@ def test_array_func_cudf_index(np_ar, func):
         lambda x: np.linalg.det(x),
     ],
 )
-def test_array_func_missing_cudf_index(np_ar, func):
+def test_array_func_missing_cudf_index(func):
+    np_ar = np.random.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     with pytest.raises(TypeError):
         func(cudf_index)
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 9d69e626c3d..5acdf36de80 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -236,6 +236,7 @@ def test_avro_compression(rows, codec):
             },
         ],
         rows,
+        seed=0,
     )
     expected_df = cudf.DataFrame.from_arrow(df)
 
@@ -599,7 +600,7 @@ def test_avro_reader_multiblock(
     else:
         assert dtype in ("float32", "float64")
         avro_type = "float" if dtype == "float32" else "double"
-
+        np.random.seed(0)
         # We don't use rand_dataframe() here, because it increases the
         # execution time of each test by a factor of 10 or more (it appears
         # to use a very costly approach to generating random data).
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index f4d1578bda7..6f88d942746 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -515,6 +515,17 @@ def test_dataframe_drop_columns(pdf, columns, inplace):
     assert_eq(expected, actual)
 
 
+@pytest.mark.parametrize("obj", ["Index", "Series"])
+def test_drop_cudf_obj_columns(obj):
+    # Column labels passed as an Index or Series must be accepted by drop.
+    pdf = pd.DataFrame({"A": [1], "B": [1]})
+    gdf = cudf.from_pandas(pdf)
+
+    expected = pdf.drop(labels=getattr(pd, obj)(["B"]), axis=1)
+    actual = gdf.drop(columns=getattr(cudf, obj)(["B"]), axis=1)
+    assert_eq(expected, actual)
+
+
 @pytest.mark.parametrize(
     "pdf",
     [
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 0aaa71e50d7..14ba9894fd3 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1940,6 +1940,23 @@ def test_groupby_nunique(agg, by):
     assert_groupby_results_equal(expect, got, check_dtype=False)
 
 
+@pytest.mark.parametrize("dropna", [True, False])
+def test_nunique_dropna(dropna):
+    gdf = cudf.DataFrame(
+        {
+            "a": [1, 1, 2],
+            "b": [4, None, 5],
+            "c": [None, None, 7],
+            "d": [1, 1, 3],
+        }
+    )
+    pdf = gdf.to_pandas()
+    # Group a == 1 has a null in "b", so the count depends on dropna.
+    expect = pdf.groupby("a")["b"].nunique(dropna=dropna)
+    got = gdf.groupby("a")["b"].nunique(dropna=dropna)
+    assert_groupby_results_equal(expect, got, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     "n",
     [0, 1, 2, 10],
@@ -2470,6 +2487,7 @@ def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct):
         ],
         rows=nelem,
         use_threads=False,
+        seed=0,
     )
     pdf = t.to_pandas()
     pdf.columns = ["x", "y", "z"]
@@ -2602,6 +2620,7 @@ def test_groupby_shift_row_mixed_numerics(
         ],
         rows=nelem,
         use_threads=False,
+        seed=0,
     )
     pdf = t.to_pandas()
     gdf = cudf.from_pandas(pdf)
@@ -2639,6 +2658,7 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction):
         ],
         rows=nelem,
         use_threads=False,
+        seed=0,
     )
     pdf = t.to_pandas()
     gdf = cudf.from_pandas(pdf)
@@ -2687,6 +2707,7 @@ def test_groupby_shift_row_mixed_fill(
         ],
         rows=nelem,
         use_threads=False,
+        seed=0,
     )
     pdf = t.to_pandas()
     gdf = cudf.from_pandas(pdf)
@@ -2732,6 +2753,7 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
         ],
         rows=nelem,
         use_threads=False,
+        seed=0,
     )
     gdf = cudf.from_pandas(t.to_pandas())
 
@@ -2782,6 +2804,7 @@ def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction):
         ],
         rows=nelem,
         use_threads=False,
+        seed=0,
     )
     pdf = t.to_pandas()
     gdf = cudf.from_pandas(pdf)
@@ -2815,6 +2838,7 @@ def test_groupby_diff_row_zero_shift(nelem):
         ],
         rows=nelem,
         use_threads=False,
+        seed=0,
     )
     gdf = cudf.from_pandas(t.to_pandas())
 
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index c2a30b76bea..1dd732c7191 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -184,25 +184,25 @@ def test_orc_read_statistics(datadir):
         pytest.skip(".orc file is not found: %s" % e)
 
     # Check numberOfValues
-    assert_eq(file_statistics[0]["int1"]["number_of_values"], 11_000)
+    assert_eq(file_statistics[0]["int1"].number_of_values, 11_000)
     assert_eq(
-        file_statistics[0]["int1"]["number_of_values"],
+        file_statistics[0]["int1"].number_of_values,
         sum(
             [
-                stripes_statistics[0]["int1"]["number_of_values"],
-                stripes_statistics[1]["int1"]["number_of_values"],
-                stripes_statistics[2]["int1"]["number_of_values"],
+                stripes_statistics[0]["int1"].number_of_values,
+                stripes_statistics[1]["int1"].number_of_values,
+                stripes_statistics[2]["int1"].number_of_values,
             ]
         ),
     )
     assert_eq(
-        stripes_statistics[1]["int1"]["number_of_values"],
-        stripes_statistics[1]["string1"]["number_of_values"],
+        stripes_statistics[1]["int1"].number_of_values,
+        stripes_statistics[1]["string1"].number_of_values,
     )
-    assert_eq(stripes_statistics[2]["string1"]["number_of_values"], 1_000)
+    assert_eq(stripes_statistics[2]["string1"].number_of_values, 1_000)
 
     # Check other statistics
-    assert_eq(stripes_statistics[2]["string1"]["has_null"], False)
+    assert_eq(stripes_statistics[2]["string1"].has_null, False)
     assert_eq(
         file_statistics[0]["int1"]["minimum"],
         min(
@@ -1538,8 +1538,8 @@ def test_empty_statistics():
     for stats in got:
         # Similar expected stats for the first 6 columns in this case
         for col_name in ascii_lowercase[:6]:
-            assert stats[0][col_name].get("number_of_values") == 0
-            assert stats[0][col_name].get("has_null") is True
+            assert stats[0][col_name].number_of_values == 0
+            assert stats[0][col_name].has_null is True
             assert stats[0][col_name].get("minimum") is None
             assert stats[0][col_name].get("maximum") is None
         for col_name in ascii_lowercase[:3]:
@@ -1547,17 +1547,17 @@ def test_empty_statistics():
         # Sum for decimal column is a string
         assert stats[0]["d"].get("sum") == "0"
 
-        assert stats[0]["g"].get("number_of_values") == 0
-        assert stats[0]["g"].get("has_null") is True
+        assert stats[0]["g"].number_of_values == 0
+        assert stats[0]["g"].has_null is True
         assert stats[0]["g"].get("true_count") == 0
         assert stats[0]["g"].get("false_count") == 0
 
-        assert stats[0]["h"].get("number_of_values") == 0
-        assert stats[0]["h"].get("has_null") is True
+        assert stats[0]["h"].number_of_values == 0
+        assert stats[0]["h"].has_null is True
         assert stats[0]["h"].get("sum") == 0
 
-        assert stats[0]["i"].get("number_of_values") == 1
-        assert stats[0]["i"].get("has_null") is False
+        assert stats[0]["i"].number_of_values == 1
+        assert stats[0]["i"].has_null is False
         assert stats[0]["i"].get("minimum") == 1
         assert stats[0]["i"].get("maximum") == 1
         assert stats[0]["i"].get("sum") == 1
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 1180da321e6..d636f36f282 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1873,7 +1873,7 @@ def _apply_filter_bool_eq(val, col_stats):
                 return False
         elif val is False:
             if (col_stats["false_count"] == 0) or (
-                col_stats["true_count"] == col_stats["number_of_values"]
+                col_stats["true_count"] == col_stats.number_of_values
             ):
                 return False
     return True
@@ -1900,7 +1900,7 @@ def _apply_predicate(op, val, col_stats):
             return False
         # TODO: Replace pd.isnull with
         # cudf.isnull once it is implemented
-        if pd.isnull(val) and not col_stats["has_null"]:
+        if pd.isnull(val) and not col_stats.has_null:
             return False
         if not _apply_filter_bool_eq(val, col_stats):
             return False
diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py
index c1317e8f467..66c15f694ee 100644
--- a/python/cudf_polars/cudf_polars/__init__.py
+++ b/python/cudf_polars/cudf_polars/__init__.py
@@ -10,13 +10,15 @@
 
 from __future__ import annotations
 
-# Check we have a supported polars version
-import cudf_polars.utils.versions as v
 from cudf_polars._version import __git_commit__, __version__
 from cudf_polars.callback import execute_with_cudf
 from cudf_polars.dsl.translate import translate_ir
 
-del v
+# Check we have a supported polars version
+from cudf_polars.utils.versions import _ensure_polars_version
+
+_ensure_polars_version()
+del _ensure_polars_version
 
 __all__: list[str] = [
     "execute_with_cudf",
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 45881afe0c8..a0291037f01 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -93,14 +93,6 @@ def _(
         cloud_options = None
     else:
         reader_options, cloud_options = map(json.loads, options)
-    if (
-        typ == "csv"
-        and visitor.version()[0] == 1
-        and reader_options["schema"] is not None
-    ):
-        reader_options["schema"] = {
-            "fields": reader_options["schema"]["inner"]
-        }  # pragma: no cover; CI tests 1.7
     file_options = node.file_options
     with_columns = file_options.with_columns
     n_rows = file_options.n_rows
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index a79d45899cd..7b6f3848fc4 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -164,9 +164,11 @@ def assert_collect_raises(
         cudf-polars.
         Useful for controlling optimization settings.
     polars_except
-        Exception or exceptions polars CPU is expected to raise.
+        Exception or exceptions polars CPU is expected to raise. If
+        an empty tuple, CPU is not expected to raise an exception.
     cudf_except
-        Exception or exceptions polars GPU is expected to raise.
+        Exception or exceptions polars GPU is expected to raise. If
+        an empty tuple, GPU is not expected to raise an exception.
     collect_kwargs
         Common keyword arguments to pass to collect for both polars CPU and
         cudf-polars.
@@ -203,7 +205,8 @@ def assert_collect_raises(
             f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}"
         ) from e
     else:
-        raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")
+        if polars_except != ():
+            raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")
 
     engine = GPUEngine(raise_on_fail=True)
     try:
@@ -212,7 +215,8 @@ def assert_collect_raises(
         pass
     except Exception as e:
         raise AssertionError(
-            f"GPU execution RAISED {type(e)}, EXPECTED {polars_except}"
+            f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}"
         ) from e
     else:
-        raise AssertionError(f"GPU execution DID NOT RAISE {polars_except}")
+        if cudf_except != ():
+            raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}")
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index c40d59e6d33..05b76d76808 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -49,11 +49,15 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
     "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
+    "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
index 2e6efde968c..4a7ad6b3cf2 100644
--- a/python/cudf_polars/cudf_polars/utils/versions.py
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -12,11 +12,11 @@
 
 POLARS_VERSION = parse(__version__)
 
-POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6")
-POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6")
-POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6")
-
-if POLARS_VERSION_LT_16:
-    raise ImportError(
-        "cudf_polars requires py-polars v1.6 or greater."
-    )  # pragma: no cover
+POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8")
+
+
+def _ensure_polars_version():
+    """Raise ImportError if the installed polars is older than v1.8."""
+    # The failing branch is excluded from coverage by the pragma below.
+    if POLARS_VERSION_LT_18:  # pragma: no cover
+        raise ImportError("cudf_polars requires py-polars v1.8 or greater.")
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index df70dc5dada..f55031e0826 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.6",
+    "polars>=1.8,<1.9",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index 6f996e0e0ec..74bf8b9e4e2 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -168,7 +168,11 @@ def test_groupby_nan_minmax_raises(op):
     "expr",
     [
         pl.lit(1).alias("value"),
-        pl.lit([[4, 5, 6]]).alias("value"),
+        pytest.param(
+            pl.lit([[4, 5, 6]]).alias("value"),
+            marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"),
+        ),
+        pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)),
         pl.col("float") * (1 - pl.col("int")),
         [pl.lit(2).alias("value"), pl.col("float") * 2],
     ],
diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py
index 8e7f1a09d9b..ace1c6b8648 100644
--- a/python/cudf_polars/tests/testing/test_asserts.py
+++ b/python/cudf_polars/tests/testing/test_asserts.py
@@ -7,8 +7,6 @@
 
 import polars as pl
 
-from cudf_polars.containers import DataFrame
-from cudf_polars.dsl.ir import Select
 from cudf_polars.testing.asserts import (
     assert_collect_raises,
     assert_gpu_result_equal,
@@ -38,14 +36,24 @@ class E(Exception):
         assert_ir_translation_raises(unsupported, E)
 
 
-def test_collect_assert_raises(monkeypatch):
+def test_collect_assert_raises():
     df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
 
-    with pytest.raises(AssertionError):
-        # This should raise, because polars CPU can run this query
+    with pytest.raises(AssertionError, match="CPU execution DID NOT RAISE"):
+        # This should raise, because polars CPU can run this query,
+        # but we expect an error.
         assert_collect_raises(
             df,
             polars_except=pl.exceptions.InvalidOperationError,
+            cudf_except=(),
+        )
+
+    with pytest.raises(AssertionError, match="GPU execution DID NOT RAISE"):
+        # This should raise, because polars GPU can run this query,
+        # but we expect an error.
+        assert_collect_raises(
+            df,
+            polars_except=(),
             cudf_except=pl.exceptions.InvalidOperationError,
         )
 
@@ -60,7 +68,7 @@ def test_collect_assert_raises(monkeypatch):
         cudf_except=pl.exceptions.InvalidOperationError,
     )
 
-    with pytest.raises(AssertionError):
+    with pytest.raises(AssertionError, match="GPU execution RAISED"):
         # This should raise because the expected GPU error is wrong
         assert_collect_raises(
             q,
@@ -68,23 +76,10 @@ def test_collect_assert_raises(monkeypatch):
             cudf_except=NotImplementedError,
         )
 
-    with pytest.raises(AssertionError):
+    with pytest.raises(AssertionError, match="CPU execution RAISED"):
         # This should raise because the expected CPU error is wrong
         assert_collect_raises(
             q,
             polars_except=NotImplementedError,
             cudf_except=pl.exceptions.InvalidOperationError,
         )
-
-    with monkeypatch.context() as m:
-        m.setattr(Select, "evaluate", lambda self, cache: DataFrame([]))
-        # This query should fail, but we monkeypatch a bad
-        # implementation of Select which "succeeds" to check that our
-        # assertion notices this case.
-        q = df.select(pl.col("a") + pl.Series([1, 2]))
-        with pytest.raises(AssertionError):
-            assert_collect_raises(
-                q,
-                polars_except=pl.exceptions.ComputeError,
-                cudf_except=pl.exceptions.ComputeError,
-            )
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 9347ebba5de..bead964a0ef 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+from packaging.version import Version
 from pandas.api.types import is_scalar
 
 import dask.dataframe as dd
@@ -52,6 +53,10 @@
 get_parallel_type.register(cudf.BaseIndex, lambda _: Index)
 
 
+# Required for Arrow filesystem support in read_parquet
+PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0")
+
+
 @meta_nonempty.register(cudf.BaseIndex)
 @_dask_cudf_performance_tracking
 def _nonempty_index(idx):
@@ -695,15 +700,140 @@ def from_dict(
         )
 
     @staticmethod
-    def read_parquet(*args, engine=None, **kwargs):
+    def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs):
+        """Read parquet through fsspec/CudfEngine (the default) or, for an
+        Arrow ``filesystem``, the experimental PyArrow-based reader."""
         import dask_expr as dx
+        import fsspec
 
-        from dask_cudf.io.parquet import CudfEngine
+        if isinstance(filesystem, fsspec.AbstractFileSystem) or (
+            isinstance(filesystem, str) and filesystem.lower() == "fsspec"
+        ):
+            # Default "fsspec" filesystem
+            from dask_cudf.io.parquet import CudfEngine
 
-        _raise_unsupported_parquet_kwargs(**kwargs)
-        return _default_backend(
-            dx.read_parquet, *args, engine=CudfEngine, **kwargs
-        )
+            _raise_unsupported_parquet_kwargs(**kwargs)
+            return _default_backend(
+                dx.read_parquet,
+                path,
+                *args,
+                filesystem=filesystem,
+                engine=CudfEngine,
+                **kwargs,
+            )
+
+        else:
+            # EXPERIMENTAL filesystem="arrow" support.
+            # This code path uses PyArrow for IO, which is only
+            # beneficial for remote storage (e.g. S3)
+
+            from fsspec.utils import stringify_path
+            from pyarrow import fs as pa_fs
+
+            # CudfReadParquetPyarrowFS requires import of distributed beforehand
+            # (See: https://github.com/dask/dask/issues/11352)
+            import distributed  # noqa: F401
+            from dask.core import flatten
+            from dask.dataframe.utils import pyarrow_strings_enabled
+
+            from dask_cudf.expr._expr import CudfReadParquetPyarrowFS
+
+            if args:
+                raise ValueError(f"Unexpected positional arguments: {args}")
+
+            if not (
+                isinstance(filesystem, pa_fs.FileSystem)
+                or isinstance(filesystem, str)
+                and filesystem.lower() in ("arrow", "pyarrow")
+            ):
+                raise ValueError(f"Unexpected filesystem value: {filesystem}.")
+
+            if not PYARROW_GE_15:
+                raise NotImplementedError(
+                    "Experimental Arrow filesystem support requires pyarrow>=15"
+                )
+
+            if not isinstance(path, str):
+                path = stringify_path(path)
+
+            # Extract kwargs
+            columns = kwargs.pop("columns", None)
+            filters = kwargs.pop("filters", None)
+            categories = kwargs.pop("categories", None)
+            index = kwargs.pop("index", None)
+            storage_options = kwargs.pop("storage_options", None)
+            dtype_backend = kwargs.pop("dtype_backend", None)
+            calculate_divisions = kwargs.pop("calculate_divisions", False)
+            ignore_metadata_file = kwargs.pop("ignore_metadata_file", False)
+            metadata_task_size = kwargs.pop("metadata_task_size", None)
+            split_row_groups = kwargs.pop("split_row_groups", "infer")
+            blocksize = kwargs.pop("blocksize", "default")
+            aggregate_files = kwargs.pop("aggregate_files", None)
+            parquet_file_extension = kwargs.pop(
+                "parquet_file_extension", (".parq", ".parquet", ".pq")
+            )
+            arrow_to_pandas = kwargs.pop("arrow_to_pandas", None)
+            open_file_options = kwargs.pop("open_file_options", None)
+
+            # Validate and normalize kwargs
+            kwargs["dtype_backend"] = dtype_backend
+            if arrow_to_pandas is not None:
+                raise ValueError(
+                    "arrow_to_pandas not supported for the 'cudf' backend."
+                )
+            if open_file_options is not None:
+                raise ValueError(
+                    "The open_file_options argument is no longer supported "
+                    "by the 'cudf' backend."
+                )
+            if filters is not None:
+                for predicate in flatten(filters, container=list):
+                    _, op, val = predicate
+                    if op == "in" and not isinstance(val, (set, list, tuple)):
+                        raise TypeError(
+                            "Value of 'in' filter must be a list, set or tuple."
+                        )
+            if metadata_task_size is not None:
+                raise NotImplementedError(
+                    "metadata_task_size is not supported when using the pyarrow filesystem."
+                )
+            if split_row_groups != "infer":
+                raise NotImplementedError(
+                    "split_row_groups is not supported when using the pyarrow filesystem."
+                )
+            if parquet_file_extension != (".parq", ".parquet", ".pq"):
+                raise NotImplementedError(
+                    "parquet_file_extension is not supported when using the pyarrow filesystem."
+                )
+            if blocksize is not None and blocksize != "default":
+                warnings.warn(
+                    "blocksize is not supported when using the pyarrow filesystem. "
+                    "blocksize argument will be ignored."
+                )
+            if aggregate_files is not None:
+                warnings.warn(
+                    "aggregate_files is not supported when using the pyarrow filesystem. "
+                    "Please use the 'dataframe.parquet.minimum-partition-size' config. "
+                    "aggregate_files argument will be ignored."
+                )
+
+            return dx.new_collection(
+                CudfReadParquetPyarrowFS(
+                    path,
+                    columns=dx._util._convert_to_list(columns),
+                    filters=filters,
+                    categories=categories,
+                    index=index,
+                    calculate_divisions=calculate_divisions,
+                    storage_options=storage_options,
+                    filesystem=filesystem,
+                    ignore_metadata_file=ignore_metadata_file,
+                    arrow_to_pandas=arrow_to_pandas,
+                    pyarrow_strings_enabled=pyarrow_strings_enabled(),
+                    kwargs=kwargs,
+                    _series=isinstance(columns, str),
+                )
+            )
 
     @staticmethod
     def read_csv(
diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py
index b284ab3774d..af83a01da98 100644
--- a/python/dask_cudf/dask_cudf/expr/_expr.py
+++ b/python/dask_cudf/dask_cudf/expr/_expr.py
@@ -2,10 +2,13 @@
 import functools
 
 import dask_expr._shuffle as _shuffle_module
+import pandas as pd
 from dask_expr import new_collection
 from dask_expr._cumulative import CumulativeBlockwise
 from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns
 from dask_expr._reductions import Reduction, Var
+from dask_expr.io.io import FusedParquetIO
+from dask_expr.io.parquet import ReadParquetPyarrowFS
 
 from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty
 from dask.dataframe.dispatch import is_categorical_dtype
@@ -18,6 +21,92 @@
 ##
 
 
+class CudfFusedParquetIO(FusedParquetIO):
+    """FusedParquetIO variant that loads data via CudfReadParquetPyarrowFS."""
+
+    @staticmethod
+    def _load_multiple_files(
+        frag_filters,
+        columns,
+        schema,
+        *to_pandas_args,
+    ):
+        # Read every (fragment, filter) pair into an arrow table with the
+        # threaded scheduler, concatenate them, and convert the result.
+        import pyarrow as pa
+
+        from dask.base import apply, tokenize
+        from dask.threaded import get
+
+        prefix = f"pq-file-{tokenize(frag_filters, columns, schema)}"
+        read_fragment = CudfReadParquetPyarrowFS._fragment_to_table
+        graph = {}
+        for pos, (fragment, frag_filter) in enumerate(frag_filters):
+            graph[(prefix, pos)] = (
+                read_fragment, fragment, frag_filter, columns, schema
+            )
+        # The final task concatenates all per-fragment tables
+        graph[prefix] = (
+            apply,
+            pa.concat_tables,
+            [list(graph.keys())],
+            {"promote_options": "permissive"},
+        )
+        table = get(graph, prefix)
+        return CudfReadParquetPyarrowFS._table_to_pandas(
+            table, *to_pandas_args
+        )
+
+
+class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS):
+    """Parquet read expression (pyarrow filesystem) producing cudf data."""
+
+    @functools.cached_property
+    def _dataset_info(self):
+        # Parent dataset info with its "base_meta" entry replaced by a
+        # cudf DataFrame
+        from dask_cudf.io.parquet import set_object_dtypes_from_pa_schema
+
+        info = super()._dataset_info
+        pandas_meta = info["base_meta"]
+        if isinstance(pandas_meta, cudf.DataFrame):
+            # Nothing to convert
+            return info
+
+        # Timezones are unsupported: make tz-aware columns naive first
+        for col, dtype in pandas_meta.dtypes.items():
+            if not isinstance(dtype, pd.DatetimeTZDtype) or dtype.tz is None:
+                continue
+            pandas_meta[col] = pandas_meta[col].dt.tz_localize(None)
+        cudf_meta = cudf.from_pandas(pandas_meta)
+
+        # Align "object" dtypes with the pyarrow schema (None if not given)
+        schema = info.get("kwargs", {}).get("schema", None)
+        set_object_dtypes_from_pa_schema(cudf_meta, schema)
+
+        # Store the converted info in the cache operand as well
+        info["base_meta"] = cudf_meta
+        cache_pos = type(self)._parameters.index("_dataset_info_cache")
+        self.operands[cache_pos] = info
+        return info
+
+    @staticmethod
+    def _table_to_pandas(table, index_name, *args):
+        # Despite the inherited name, the result is a cudf DataFrame
+        frame = cudf.DataFrame.from_arrow(table)
+        if index_name is None:
+            return frame
+        return frame.set_index(index_name)
+
+    def _tune_up(self, parent):
+        # Wrap this expression in CudfFusedParquetIO, unless the compression
+        # factor is >= 1 or the parent already is a CudfFusedParquetIO
+        already_fused = isinstance(parent, CudfFusedParquetIO)
+        if self._fusion_compression_factor >= 1 or already_fused:
+            return
+        return parent.substitute(self, CudfFusedParquetIO(self))
+
+
 class RenameAxisCudf(RenameAxis):
     # TODO: Remove this after rename_axis is supported in cudf
     # (See: https://github.com/rapidsai/cudf/issues/16895)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a14ffbc37dc..cf8af82e112 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -12,6 +12,7 @@
 from dask.dataframe import assert_eq
 
 import dask_cudf
+from dask_cudf.tests.utils import QUERY_PLANNING_ON
 
 moto = pytest.importorskip("moto", minversion="3.1.6")
 boto3 = pytest.importorskip("boto3")
@@ -127,7 +128,20 @@ def test_read_parquet_open_file_options_raises():
         )
 
 
-def test_read_parquet_filesystem(s3_base, s3so, pdf):
+@pytest.mark.parametrize(
+    "filesystem",
+    [
+        pytest.param(
+            "arrow",
+            marks=pytest.mark.skipif(
+                not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15,
+                reason="Not supported",
+            ),
+        ),
+        "fsspec",
+    ],
+)
+def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem):
     fname = "test_parquet_filesystem.parquet"
     bucket = "parquet"
     buffer = BytesIO()
@@ -135,21 +149,24 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf):
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
         path = f"s3://{bucket}/{fname}"
+        if filesystem == "arrow":
+            # This feature requires arrow >= 15
+            pytest.importorskip("pyarrow", minversion="15.0.0")
 
-        # Cannot pass filesystem="arrow"
-        with pytest.raises(ValueError):
-            dask_cudf.read_parquet(
+            import pyarrow.fs as pa_fs
+
+            df = dask_cudf.read_parquet(
+                path,
+                filesystem=pa_fs.S3FileSystem(
+                    endpoint_override=s3so["client_kwargs"]["endpoint_url"],
+                ),
+            )
+        else:
+            df = dask_cudf.read_parquet(
                 path,
                 storage_options=s3so,
-                filesystem="arrow",
+                filesystem=filesystem,
             )
-
-        # Can pass filesystem="fsspec"
-        df = dask_cudf.read_parquet(
-            path,
-            storage_options=s3so,
-            filesystem="fsspec",
-        )
         assert df.b.sum().compute() == 9
 
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py
index 88b15718382..d03e92319be 100644
--- a/python/dask_cudf/dask_cudf/tests/test_reductions.py
+++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -13,6 +13,7 @@
 
 
 def _make_random_frame(nelem, npartitions=2):
+    np.random.seed(0)
     df = pd.DataFrame(
         {
             "x": np.random.randint(0, 5, size=nelem),
@@ -38,7 +39,6 @@ def wrapped(series):
 @pytest.mark.parametrize("reducer", _reducers)
 def test_series_reduce(reducer):
     reducer = _get_reduce_fn(reducer)
-    np.random.seed(0)
     size = 10
     df, gdf = _make_random_frame(size)
 
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index f07c8897e34..a7cb66d7b16 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -31,6 +31,7 @@ set(cython_sources
     lists.pyx
     merge.pyx
     null_mask.pyx
+    partitioning.pyx
     quantiles.pyx
     reduce.pyx
     replace.pyx
@@ -44,6 +45,7 @@ set(cython_sources
     table.pyx
     traits.pyx
     transform.pyx
+    transpose.pyx
     types.pyx
     unary.pyx
     utils.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index b7cf6413c05..a384edd456d 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -17,6 +17,7 @@ from . cimport (
     lists,
     merge,
     null_mask,
+    partitioning,
     quantiles,
     reduce,
     replace,
@@ -29,6 +30,7 @@ from . cimport (
     strings,
     traits,
     transform,
+    transpose,
     types,
     unary,
 )
@@ -60,6 +62,7 @@ __all__ = [
     "lists",
     "merge",
     "null_mask",
+    "partitioning",
     "quantiles",
     "reduce",
     "replace",
@@ -72,6 +75,7 @@ __all__ = [
     "sorting",
     "traits",
     "transform",
+    "transpose",
     "types",
     "unary",
 ]
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index 84b1c29f791..2a5365e8fad 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -28,6 +28,7 @@
     lists,
     merge,
     null_mask,
+    partitioning,
     quantiles,
     reduce,
     replace,
@@ -40,6 +41,7 @@
     strings,
     traits,
     transform,
+    transpose,
     types,
     unary,
 )
@@ -74,6 +76,7 @@
     "lists",
     "merge",
     "null_mask",
+    "partitioning",
     "quantiles",
     "reduce",
     "replace",
@@ -86,6 +89,7 @@
     "sorting",
     "traits",
     "transform",
+    "transpose",
     "types",
     "unary",
 ]
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
index 5a67f4d6cdb..5f9d145139a 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -94,7 +94,7 @@ cpdef bool is_supported_operation(
 ):
     """Check if an operation is supported for the given data types.
 
-    For details, see :cpp:func::is_supported_operation`.
+    For details, see :cpp:func:`is_supported_operation`.
 
     Parameters
     ----------
diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx
index 4601cba515a..e9085e3ea02 100644
--- a/python/pylibcudf/pylibcudf/column_factories.pyx
+++ b/python/pylibcudf/pylibcudf/column_factories.pyx
@@ -18,6 +18,20 @@ from .types import MaskState, TypeId
 
 
 cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id):
+    """Creates an empty column of the specified type.
+
+    For details, see :cpp:func:`make_empty_column`.
+
+    Parameters
+    ----------
+    type_or_id : Union[DataType, type_id, object]
+        The column data type.
+
+    Returns
+    -------
+    Column
+        An empty Column
+    """
     cdef unique_ptr[column] result
     cdef type_id id
 
@@ -60,7 +74,11 @@ cpdef Column make_numeric_column(
     size_type size,
     MaskArg mstate
 ):
+    """Creates an empty numeric column.
+
+    For details, see :cpp:func:`make_numeric_column`.
 
+    """
     cdef unique_ptr[column] result
     cdef mask_state state
 
diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx
index ae5d33aaa46..afb95dba5b3 100644
--- a/python/pylibcudf/pylibcudf/groupby.pyx
+++ b/python/pylibcudf/pylibcudf/groupby.pyx
@@ -286,7 +286,7 @@ cdef class GroupBy:
 
         Returns
         -------
-        Tuple[List[int], Table, Table]]
+        Tuple[List[int], Table, Table]
             A tuple of tables containing three items:
                 - A list of integer offsets into the group keys/values
                 - A table of group keys
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index bcc2151f5b6..965724a47b1 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,9 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx parquet.pyx types.pyx)
+set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx
+                   types.pyx
+)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
index 62820048584..1bcc0a3f963 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/io/__init__.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, parquet, types
+from . cimport avro, datasource, json, orc, parquet, timezone, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 27640f7d955..2e4f215b12c 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, parquet, types
+from . import avro, csv, datasource, json, orc, parquet, timezone, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
index 667c67f4c36..438b0ff1634 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -20,6 +20,8 @@ cpdef TableWithMetadata read_avro(
     """
     Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:func:`read_avro`.
+
     Parameters
     ----------
     source_info: SourceInfo
diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd
new file mode 100644
index 00000000000..b111d617b1b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/orc.pxd
@@ -0,0 +1,50 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libc.stdint cimport uint64_t
+from libcpp cimport bool
+from libcpp.optional cimport optional
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.orc_metadata cimport (
+    column_statistics,
+    parsed_orc_statistics,
+    statistics_type,
+)
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.types cimport DataType
+
+
+cpdef TableWithMetadata read_orc(
+    SourceInfo source_info,
+    list columns = *,
+    list stripes = *,
+    size_type skip_rows = *,
+    size_type nrows = *,
+    bool use_index = *,
+    bool use_np_dtypes = *,
+    DataType timestamp_type = *,
+    list decimal128_columns = *
+)
+
+cdef class OrcColumnStatistics:  # Python view of libcudf column_statistics
+    cdef optional[uint64_t] number_of_values_c  # unset -> None in Python
+    cdef optional[bool] has_null_c  # unset -> None in Python
+    cdef statistics_type type_specific_stats_c  # libcudf statistics variant
+    cdef dict column_stats  # Python-side stats, e.g. "minimum", "sum"
+
+    cdef void _init_stats_dict(self)  # fills column_stats from the variant
+
+    @staticmethod
+    cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats)
+
+
+cdef class ParsedOrcStatistics:  # wraps a libcudf parsed_orc_statistics
+    cdef parsed_orc_statistics c_obj
+
+    @staticmethod
+    cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats)
+
+
+cpdef ParsedOrcStatistics read_parsed_orc_statistics(
+    SourceInfo source_info
+)
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
new file mode 100644
index 00000000000..01a5e4b04a1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -0,0 +1,302 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp cimport bool
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+import datetime
+
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.orc cimport (
+    orc_reader_options,
+    read_orc as cpp_read_orc,
+)
+from pylibcudf.libcudf.io.orc_metadata cimport (
+    binary_statistics,
+    bucket_statistics,
+    column_statistics,
+    date_statistics,
+    decimal_statistics,
+    double_statistics,
+    integer_statistics,
+    no_statistics,
+    read_parsed_orc_statistics as cpp_read_parsed_orc_statistics,
+    statistics_type,
+    string_statistics,
+    timestamp_statistics,
+)
+from pylibcudf.libcudf.io.types cimport table_with_metadata
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.types cimport DataType
+from pylibcudf.variant cimport get_if, holds_alternative
+
+
+cdef class OrcColumnStatistics:  # Python view of libcudf column_statistics
+    def __init__(self):  # only constructible through from_libcudf
+        raise TypeError(
+            "OrcColumnStatistics should not be instantiated by users. If it is "
+            "being constructed in Cython from a preexisting libcudf object, "
+            "use `OrcColumnStatistics.from_libcudf` instead."
+        )
+
+    @property
+    def number_of_values(self):  # None when the statistic is absent
+        if self.number_of_values_c.has_value():
+            return self.number_of_values_c.value()
+        return None
+
+    @property
+    def has_null(self):  # None when the statistic is absent
+        if self.has_null_c.has_value():
+            return self.has_null_c.value()
+        return None
+
+    cdef void _init_stats_dict(self):
+        # Initialize stats to return and parse stats blob
+        self.column_stats = {}
+
+        cdef statistics_type type_specific_stats = self.type_specific_stats_c
+
+        cdef integer_statistics* int_stats
+        cdef double_statistics* dbl_stats
+        cdef string_statistics* str_stats
+        cdef bucket_statistics* bucket_stats
+        cdef decimal_statistics* dec_stats
+        cdef date_statistics* date_stats
+        cdef binary_statistics* bin_stats
+        cdef timestamp_statistics* ts_stats
+
+        if holds_alternative[no_statistics](type_specific_stats):
+            pass  # nothing type-specific to record
+        elif int_stats := get_if[integer_statistics](&type_specific_stats):
+            if int_stats.minimum.has_value():
+                self.column_stats["minimum"] = int_stats.minimum.value()
+            else:
+                self.column_stats["minimum"] = None
+            if int_stats.maximum.has_value():
+                self.column_stats["maximum"] = int_stats.maximum.value()
+            else:
+                self.column_stats["maximum"] = None
+            if int_stats.sum.has_value():
+                self.column_stats["sum"] = int_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif dbl_stats := get_if[double_statistics](&type_specific_stats):
+            if dbl_stats.minimum.has_value():
+                self.column_stats["minimum"] = dbl_stats.minimum.value()
+            else:
+                self.column_stats["minimum"] = None
+            if dbl_stats.maximum.has_value():
+                self.column_stats["maximum"] = dbl_stats.maximum.value()
+            else:
+                self.column_stats["maximum"] = None
+            if dbl_stats.sum.has_value():
+                self.column_stats["sum"] = dbl_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif str_stats := get_if[string_statistics](&type_specific_stats):
+            if str_stats.minimum.has_value():
+                self.column_stats["minimum"] = str_stats.minimum.value().decode("utf-8")
+            else:
+                self.column_stats["minimum"] = None
+            if str_stats.maximum.has_value():
+                self.column_stats["maximum"] = str_stats.maximum.value().decode("utf-8")
+            else:
+                self.column_stats["maximum"] = None
+            if str_stats.sum.has_value():
+                self.column_stats["sum"] = str_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif bucket_stats := get_if[bucket_statistics](&type_specific_stats):
+            self.column_stats["true_count"] = bucket_stats.count[0]
+            self.column_stats["false_count"] = (
+                self.number_of_values
+                - self.column_stats["true_count"]
+            )  # NOTE(review): TypeError if number_of_values is None
+        elif dec_stats := get_if[decimal_statistics](&type_specific_stats):
+            if dec_stats.minimum.has_value():
+                self.column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8")
+            else:
+                self.column_stats["minimum"] = None
+            if dec_stats.maximum.has_value():
+                self.column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8")
+            else:
+                self.column_stats["maximum"] = None
+            if dec_stats.sum.has_value():
+                self.column_stats["sum"] = dec_stats.sum.value().decode("utf-8")
+            else:
+                self.column_stats["sum"] = None
+        elif date_stats := get_if[date_statistics](&type_specific_stats):
+            if date_stats.minimum.has_value():  # value goes to timedelta as days
+                self.column_stats["minimum"] = datetime.datetime.fromtimestamp(
+                    datetime.timedelta(date_stats.minimum.value()).total_seconds(),
+                    datetime.timezone.utc,
+                )
+            else:
+                self.column_stats["minimum"] = None
+            if date_stats.maximum.has_value():
+                self.column_stats["maximum"] = datetime.datetime.fromtimestamp(
+                    datetime.timedelta(date_stats.maximum.value()).total_seconds(),
+                    datetime.timezone.utc,
+                )
+            else:
+                self.column_stats["maximum"] = None
+        elif bin_stats := get_if[binary_statistics](&type_specific_stats):
+            if bin_stats.sum.has_value():
+                self.column_stats["sum"] = bin_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif ts_stats := get_if[timestamp_statistics](&type_specific_stats):
+            # Before ORC-135, the local timezone offset was included and they were
+            # stored as minimum and maximum. After ORC-135, the timestamp is
+            # adjusted to UTC before being converted to milliseconds and stored
+            # in minimumUtc and maximumUtc.
+            # TODO: Support minimum and maximum by reading writer's local timezone
+            if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value():
+                self.column_stats["minimum"] = datetime.datetime.fromtimestamp(
+                    ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc
+                )
+                self.column_stats["maximum"] = datetime.datetime.fromtimestamp(
+                    ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc
+                )  # no keys are set if either UTC bound is missing
+        else:
+            raise ValueError("Unsupported statistics type")
+
+    def __getitem__(self, item):  # KeyError for statistics not recorded
+        return self.column_stats[item]
+
+    def __contains__(self, item):
+        return item in self.column_stats
+
+    def get(self, item, default=None):  # dict.get semantics
+        return self.column_stats.get(item, default)
+
+    @staticmethod
+    cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats):
+        cdef OrcColumnStatistics out = OrcColumnStatistics.__new__(OrcColumnStatistics)
+        out.number_of_values_c = col_stats.number_of_values
+        out.has_null_c = col_stats.has_null
+        out.type_specific_stats_c = col_stats.type_specific_stats
+        out._init_stats_dict()  # build the Python dict eagerly
+        return out
+
+
+cdef class ParsedOrcStatistics:
+    """Python wrapper around a libcudf ``parsed_orc_statistics``."""
+    @property
+    def column_names(self):  # decoded to a list of str
+        return [name.decode() for name in self.c_obj.column_names]
+
+    @property
+    def file_stats(self):  # one OrcColumnStatistics per entry
+        return [
+            OrcColumnStatistics.from_libcudf(self.c_obj.file_stats[i])
+            for i in range(self.c_obj.file_stats.size())
+        ]
+
+    @property
+    def stripes_stats(self):  # one list of OrcColumnStatistics per stripe
+        return [
+            [
+                OrcColumnStatistics.from_libcudf(stripe_stats_c[i])
+                for i in range(stripe_stats_c.size())
+            ]
+            for stripe_stats_c in self.c_obj.stripes_stats
+        ]
+
+    @staticmethod
+    cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats):
+        cdef ParsedOrcStatistics out = ParsedOrcStatistics.__new__(ParsedOrcStatistics)
+        out.c_obj = move(orc_stats)  # moved out of the caller's object
+        return out
+
+
+cpdef TableWithMetadata read_orc(
+    SourceInfo source_info,
+    list columns = None,
+    list stripes = None,
+    size_type skip_rows = 0,
+    size_type nrows = -1,
+    bool use_index = True,
+    bool use_np_dtypes = True,
+    DataType timestamp_type = None,
+    list decimal128_columns = None,
+):
+    """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the ORC file from.
+    columns : list, default None
+        The string names of the columns to be read.
+    stripes : list[list[size_type]], default None
+        List of stripes to be read.
+    skip_rows : size_type, default 0
+        The number of rows to skip from the start of the file.
+    nrows : size_type, default -1
+        The number of rows to read. By default, read the entire file.
+    use_index : bool, default True
+        Whether to use the row index to speed up reading.
+    use_np_dtypes : bool, default True
+        Whether to use numpy compatible dtypes.
+    timestamp_type : DataType, default None
+        The timestamp type to use for the timestamp columns.
+    decimal128_columns : list, default None
+        List of column names to be read as 128-bit decimals.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata (column names) that were read in.
+    """
+    cdef orc_reader_options opts
+    cdef vector[vector[size_type]] c_stripes
+    opts = move(
+        orc_reader_options.builder(source_info.c_obj)
+        .use_index(use_index)
+        .build()
+    )
+    opts.enable_use_np_dtypes(use_np_dtypes)  # forward the flag to libcudf
+    if nrows >= 0:  # a negative nrows keeps the reader's default
+        opts.set_num_rows(nrows)
+    if skip_rows >= 0:
+        opts.set_skip_rows(skip_rows)
+    if stripes is not None:
+        c_stripes = stripes
+        opts.set_stripes(c_stripes)
+    if timestamp_type is not None:
+        opts.set_timestamp_type(timestamp_type.c_obj)
+
+    cdef vector[string] c_decimal128_columns
+    if decimal128_columns is not None and len(decimal128_columns) > 0:
+        c_decimal128_columns.reserve(len(decimal128_columns))
+        for col in decimal128_columns:
+            if not isinstance(col, str):
+                raise TypeError("Decimal 128 column names must be strings!")
+            c_decimal128_columns.push_back(col.encode())
+        opts.set_decimal128_columns(c_decimal128_columns)
+
+    cdef vector[string] c_column_names
+    if columns is not None and len(columns) > 0:
+        c_column_names.reserve(len(columns))
+        for col in columns:
+            if not isinstance(col, str):
+                raise TypeError("Column names must be strings!")
+            c_column_names.push_back(col.encode())
+        opts.set_columns(c_column_names)
+
+    cdef table_with_metadata c_result
+    with nogil:
+        c_result = move(cpp_read_orc(opts))
+
+    return TableWithMetadata.from_libcudf(c_result)
+
+
+cpdef ParsedOrcStatistics read_parsed_orc_statistics(
+    SourceInfo source_info
+):
+    """Read the parsed file- and stripe-level statistics of an ORC source."""
+    cdef parsed_orc_statistics c_stats
+    c_stats = cpp_read_parsed_orc_statistics(source_info.c_obj)
+    return ParsedOrcStatistics.from_libcudf(c_stats)
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index df1f1b14247..981ca7b8159 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -59,6 +59,8 @@ cdef class ChunkedParquetReader:
     """
     Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:class:`chunked_parquet_reader`.
+
     Parameters
     ----------
     source_info : SourceInfo
@@ -167,6 +169,8 @@ cpdef read_parquet(
 ):
     """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:func:`read_parquet`.
+
     Parameters
     ----------
     source_info : SourceInfo
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/io/timezone.pxd
new file mode 100644
index 00000000000..2aa755dbbd8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pxd
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from ..table cimport Table
+
+
+cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name)
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx
new file mode 100644
index 00000000000..e02239d7252
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pyx
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.optional cimport make_optional
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.libcudf.io.timezone cimport (
+    make_timezone_transition_table as cpp_make_timezone_transition_table,
+)
+from pylibcudf.libcudf.table.table cimport table
+
+from ..table cimport Table
+
+
+cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name):
+    """
+    Build the transition table used to convert ORC timestamps to UTC.
+
+    Parameters
+    ----------
+    tzif_dir : str
+        Directory holding the TZif files.
+    timezone_name : str
+        Standard name of the timezone to build the table for.
+
+    Returns
+    -------
+    Table
+        Transition table of the requested timezone.
+    """
+    cdef string tz_dir_bytes = tzif_dir.encode()
+    cdef string tz_name_bytes = timezone_name.encode()
+    cdef unique_ptr[table] transitions
+
+    # Both strings are encoded up front so that the libcudf call can run
+    # inside the nogil block.
+    with nogil:
+        transitions = move(
+            cpp_make_timezone_transition_table(
+                make_optional[string](tz_dir_bytes), tz_name_bytes
+            )
+        )
+    return Table.from_libcudf(move(transitions))
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 1600a805b37..563a02761da 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -130,6 +130,7 @@ cdef class TableWithMetadata:
         """
         return self.metadata.per_file_user_data
 
+
 cdef class SourceInfo:
     """A class containing details on a source to read from.
 
diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx
index b5a7445df36..b3f6a92d85c 100644
--- a/python/pylibcudf/pylibcudf/labeling.pyx
+++ b/python/pylibcudf/pylibcudf/labeling.pyx
@@ -20,6 +20,8 @@ cpdef Column label_bins(
 ):
     """Labels elements based on membership in the specified bins.
 
+    For details see :cpp:func:`label_bins`.
+
     Parameters
     ----------
     input : Column
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
index e4a09b8feb2..dca24c7f665 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
@@ -35,6 +35,7 @@ cdef extern from "cudf/io/orc.hpp" \
         void enable_use_index(bool val) except +
         void enable_use_np_dtypes(bool val) except +
         void set_timestamp_type(data_type type) except +
+        void set_decimal128_columns(vector[string] val) except +
 
         @staticmethod
         orc_reader_options_builder builder(
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
index db6cb0cdfa5..9302ffe2f80 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
@@ -1,11 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-cimport pylibcudf.libcudf.io.types as cudf_io_types
 from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t
 from libcpp cimport bool
 from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from pylibcudf.libcudf.io cimport types as cudf_io_types
 from pylibcudf.variant cimport monostate, variant
 
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
index 1ea10e8a194..89bddbffab5 100644
--- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
@@ -25,3 +25,10 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil:
         const column_view& partition_map,
         int num_partitions
     ) except +
+
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \
+        round_robin_partition "cudf::round_robin_partition" (
+        const table_view& input,
+        int num_partitions,
+        int start_partition
+    ) except +
diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx
index 947caddc485..6f82124d06e 100644
--- a/python/pylibcudf/pylibcudf/lists.pyx
+++ b/python/pylibcudf/pylibcudf/lists.pyx
@@ -52,6 +52,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx):
 
     All other columns will be duplicated for each element in the list.
 
+    For details, see :cpp:func:`explode_outer`.
+
     Parameters
     ----------
     input : Table
@@ -75,6 +77,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx):
 cpdef Column concatenate_rows(Table input):
     """Concatenate multiple lists columns into a single lists column row-wise.
 
+    For details, see :cpp:func:`concatenate_rows`.
+
     Parameters
     ----------
     input : Table
@@ -96,6 +100,8 @@ cpdef Column concatenate_rows(Table input):
 cpdef Column concatenate_list_elements(Column input, bool dropna):
     """Concatenate multiple lists on the same row into a single list.
 
+    For details, see :cpp:func:`concatenate_list_elements`.
+
     Parameters
     ----------
     input : Column
@@ -168,6 +174,8 @@ cpdef Column contains_nulls(Column input):
     """Create a column of bool values indicating whether
     each row in the lists column contains a null value.
 
+    For details, see :cpp:func:`contains_nulls`.
+
     Parameters
     ----------
     input : Column
@@ -290,6 +298,8 @@ cpdef Column segmented_gather(Column input, Column gather_map_list):
 cpdef Column extract_list_element(Column input, ColumnOrSizeType index):
     """Create a column of extracted list elements.
 
+    For details, see :cpp:func:`extract_list_element`.
+
     Parameters
     ----------
     input : Column
@@ -318,6 +328,8 @@ cpdef Column count_elements(Column input):
     list element in the given lists column.
     For details, see :cpp:func:`count_elements`.
 
+    For details, see :cpp:func:`count_elements`.
+
     Parameters
     ----------
     input : Column
diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx
index a7d43c9d158..6d707b67449 100644
--- a/python/pylibcudf/pylibcudf/merge.pyx
+++ b/python/pylibcudf/pylibcudf/merge.pyx
@@ -19,6 +19,8 @@ cpdef Table merge (
 ):
     """Merge a set of sorted tables.
 
+    For details see :cpp:func:`merge`.
+
     Parameters
     ----------
     tables_to_merge : list
diff --git a/python/pylibcudf/pylibcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/partitioning.pxd
new file mode 100644
index 00000000000..aad60149fc4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/partitioning.pxd
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef tuple[Table, list] hash_partition(
+    Table input,
+    list columns_to_hash,
+    int num_partitions
+)
+
+cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions)
+
+cpdef tuple[Table, list] round_robin_partition(
+    Table input,
+    int num_partitions,
+    int start_partition=*
+)
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx
new file mode 100644
index 00000000000..8fa70daab5a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/partitioning.pyx
@@ -0,0 +1,120 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+cimport pylibcudf.libcudf.types as libcudf_types
+from libcpp.memory cimport unique_ptr
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+from pylibcudf.libcudf cimport partitioning as cpp_partitioning
+from pylibcudf.libcudf.table.table cimport table
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef tuple[Table, list] hash_partition(
+    Table input,
+    list columns_to_hash,
+    int num_partitions
+):
+    """
+    Partitions rows of the input table by hashing the given columns.
+
+    For details, see :cpp:func:`hash_partition`.
+
+    Parameters
+    ----------
+    input : Table
+        The table to partition
+    columns_to_hash : list[int]
+        Indices of input columns to hash
+    num_partitions : int
+        The number of partitions to use
+
+    Returns
+    -------
+    tuple[Table, list[int]]
+        A single output table and a list of row offsets to each partition
+    """
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
+    cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
+    cdef int c_num_partitions = num_partitions
+
+    with nogil:
+        c_result = move(
+            cpp_partitioning.hash_partition(
+                input.view(), c_columns_to_hash, c_num_partitions
+            )
+        )
+
+    return Table.from_libcudf(move(c_result.first)), list(c_result.second)
+
+cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions):
+    """
+    Partitions rows of `t` according to the mapping specified by `partition_map`.
+
+    For details, see :cpp:func:`partition`.
+
+    Parameters
+    ----------
+    t : Table
+        The table to partition
+    partition_map : Column
+        Non-nullable column of integer values that map each row
+        in `t` to its partition.
+    num_partitions : int
+        The total number of partitions
+
+    Returns
+    -------
+    tuple[Table, list[int]]
+        An output table and a list of row offsets to each partition
+    """
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
+    cdef int c_num_partitions = num_partitions
+
+    with nogil:
+        c_result = move(
+            cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions)
+        )
+
+    return Table.from_libcudf(move(c_result.first)), list(c_result.second)
+
+
+cpdef tuple[Table, list] round_robin_partition(
+    Table input,
+    int num_partitions,
+    int start_partition=0
+):
+    """
+    Partitions rows of the input table in a round-robin fashion.
+
+    For details, see :cpp:func:`round_robin_partition`.
+
+    Parameters
+    ----------
+    input : Table
+        The input table to be round-robin partitioned
+    num_partitions : int
+        Number of partitions for the table
+    start_partition : int, default 0
+        Index of the first partition
+
+    Returns
+    -------
+    tuple[Table, list[int]]
+        The partitioned table and the partition offsets
+        for each partition within the table.
+    """
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
+    cdef int c_num_partitions = num_partitions
+    cdef int c_start_partition = start_partition
+
+    with nogil:
+        c_result = move(
+            cpp_partitioning.round_robin_partition(
+                input.view(), c_num_partitions, c_start_partition
+            )
+        )
+
+    return Table.from_libcudf(move(c_result.first)), list(c_result.second)
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx
index b847ade774d..3a771fbe7ef 100644
--- a/python/pylibcudf/pylibcudf/quantiles.pyx
+++ b/python/pylibcudf/pylibcudf/quantiles.pyx
@@ -30,6 +30,8 @@ cpdef Column quantile(
     Computes the specified quantiles by interpolating values between which they lie,
     using the interpolation strategy specified in interp.
 
+    For details see :cpp:func:`quantile`.
+
     Parameters
     ----------
     input: Column
@@ -91,6 +93,8 @@ cpdef Table quantiles(
     specified quantiles. In the event a quantile lies in between rows, the specified
     interpolation strategy is used to pick between the rows.
 
+    For details see :cpp:func:`quantiles`.
+
     Parameters
     ----------
     input: Table
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx
index a99145be900..eb1499ebbea 100644
--- a/python/pylibcudf/pylibcudf/reshape.pyx
+++ b/python/pylibcudf/pylibcudf/reshape.pyx
@@ -23,6 +23,8 @@ cpdef Column interleave_columns(Table source_table):
     in     = [[A1, A2, A3], [B1, B2, B3]]
     return = [A1, B1, A2, B2, A3, B3]
 
+    For details, see :cpp:func:`interleave_columns`.
+
     Parameters
     ----------
     source_table: Table
@@ -44,6 +46,8 @@ cpdef Column interleave_columns(Table source_table):
 cpdef Table tile(Table source_table, size_type count):
     """Repeats the rows from input table count times to form a new table.
 
+    For details, see :cpp:func:`tile`.
+
     Parameters
     ----------
     source_table: Table
diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx
index ff2468f3f9c..814bc6553d8 100644
--- a/python/pylibcudf/pylibcudf/search.pyx
+++ b/python/pylibcudf/pylibcudf/search.pyx
@@ -19,6 +19,8 @@ cpdef Column lower_bound(
 ):
     """Find smallest indices in haystack where needles may be inserted to retain order.
 
+    For details, see :cpp:func:`lower_bound`.
+
     Parameters
     ----------
     haystack : Table
@@ -58,6 +60,8 @@ cpdef Column upper_bound(
 ):
     """Find largest indices in haystack where needles may be inserted to retain order.
 
+    For details, see :cpp:func:`upper_bound`.
+
     Parameters
     ----------
     haystack : Table
@@ -92,6 +96,8 @@ cpdef Column upper_bound(
 cpdef Column contains(Column haystack, Column needles):
     """Check whether needles are present in haystack.
 
+    For details, see :cpp:func:`contains`.
+
     Parameters
     ----------
     haystack : Table
diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx
index bd173eebacb..42289d54bca 100644
--- a/python/pylibcudf/pylibcudf/sorting.pyx
+++ b/python/pylibcudf/pylibcudf/sorting.pyx
@@ -16,6 +16,8 @@ from .table cimport Table
 cpdef Column sorted_order(Table source_table, list column_order, list null_precedence):
     """Computes the row indices required to sort the table.
 
+    For details, see :cpp:func:`sorted_order`.
+
     Parameters
     ----------
     source_table : Table
@@ -52,6 +54,8 @@ cpdef Column stable_sorted_order(
     """Computes the row indices required to sort the table,
     preserving order of equal elements.
 
+    For details, see :cpp:func:`stable_sorted_order`.
+
     Parameters
     ----------
     source_table : Table
@@ -90,6 +94,8 @@ cpdef Column rank(
 ):
     """Computes the rank of each element in the column.
 
+    For details, see :cpp:func:`rank`.
+
     Parameters
     ----------
     input_view : Column
@@ -128,6 +134,8 @@ cpdef Column rank(
 cpdef bool is_sorted(Table tbl, list column_order, list null_precedence):
     """Checks if the table is sorted.
 
+    For details, see :cpp:func:`is_sorted`.
+
     Parameters
     ----------
     tbl : Table
@@ -165,6 +173,8 @@ cpdef Table segmented_sort_by_key(
 ):
     """Sorts the table by key, within segments.
 
+    For details, see :cpp:func:`segmented_sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -209,6 +219,8 @@ cpdef Table stable_segmented_sort_by_key(
     """Sorts the table by key preserving order of equal elements,
     within segments.
 
+    For details, see :cpp:func:`stable_segmented_sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -251,6 +263,8 @@ cpdef Table sort_by_key(
 ):
     """Sorts the table by key.
 
+    For details, see :cpp:func:`sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -290,6 +304,8 @@ cpdef Table stable_sort_by_key(
 ):
     """Sorts the table by key preserving order of equal elements.
 
+    For details, see :cpp:func:`stable_sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -324,6 +340,8 @@ cpdef Table stable_sort_by_key(
 cpdef Table sort(Table source_table, list column_order, list null_precedence):
     """Sorts the table.
 
+    For details, see :cpp:func:`sort`.
+
     Parameters
     ----------
     source_table : Table
@@ -355,6 +373,8 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence):
 cpdef Table stable_sort(Table source_table, list column_order, list null_precedence):
     """Sorts the table preserving order of equal elements.
 
+    For details, see :cpp:func:`stable_sort`.
+
     Parameters
     ----------
     source_table : Table
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx
index b574bfa9fa2..d5475ea79d5 100644
--- a/python/pylibcudf/pylibcudf/stream_compaction.pyx
+++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx
@@ -25,6 +25,8 @@ from .table cimport Table
 cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold):
     """Filters out rows from the input table based on the presence of nulls.
 
+    For details, see :cpp:func:`drop_nulls`.
+
     Parameters
     ----------
     source_table : Table
@@ -53,6 +55,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold):
 cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold):
     """Filters out rows from the input table based on the presence of NaNs.
 
+    For details, see :cpp:func:`drop_nans`.
+
     Parameters
     ----------
     source_table : Table
@@ -81,6 +85,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold):
 cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask):
     """Filters out rows from the input table based on a boolean mask.
 
+    For details, see :cpp:func:`apply_boolean_mask`.
+
     Parameters
     ----------
     source_table : Table
@@ -111,6 +117,8 @@ cpdef Table unique(
 ):
     """Filter duplicate consecutive rows from the input table.
 
+    For details, see :cpp:func:`unique`.
+
     Parameters
     ----------
     input : Table
@@ -153,6 +161,8 @@ cpdef Table distinct(
 ):
     """Get the distinct rows from the input table.
 
+    For details, see :cpp:func:`distinct`.
+
     Parameters
     ----------
     input : Table
@@ -191,6 +201,8 @@ cpdef Column distinct_indices(
 ):
     """Get the indices of the distinct rows from the input table.
 
+    For details, see :cpp:func:`distinct_indices`.
+
     Parameters
     ----------
     input : Table
@@ -226,6 +238,8 @@ cpdef Table stable_distinct(
 ):
     """Get the distinct rows from the input table, preserving input order.
 
+    For details, see :cpp:func:`stable_distinct`.
+
     Parameters
     ----------
     input : Table
@@ -263,6 +277,8 @@ cpdef size_type unique_count(
 ):
     """Returns the number of unique consecutive elements in the input column.
 
+    For details, see :cpp:func:`unique_count`.
+
     Parameters
     ----------
     source : Column
@@ -294,6 +310,8 @@ cpdef size_type distinct_count(
 ):
     """Returns the number of distinct elements in the input column.
 
+    For details, see :cpp:func:`distinct_count`.
+
     Parameters
     ----------
     source : Column
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index 77f20b0b917..142bc124ca2 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -13,8 +13,21 @@
 # =============================================================================
 
 set(cython_sources
-    capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx
-    regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx
+    attributes.pyx
+    capitalize.pyx
+    case.pyx
+    char_types.pyx
+    contains.pyx
+    extract.pyx
+    find.pyx
+    findall.pyx
+    regex_flags.pyx
+    regex_program.pyx
+    repeat.pyx
+    replace.pyx
+    side_type.pyx
+    slice.pyx
+    strip.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index 91d884b294b..d8afccc7336 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from . cimport (
+    attributes,
     capitalize,
     case,
     char_types,
@@ -16,3 +17,21 @@ from . cimport (
     strip,
 )
 from .side_type cimport side_type
+
+__all__ = [
+    "attributes",
+    "capitalize",
+    "case",
+    "char_types",
+    "contains",
+    "convert",
+    "extract",
+    "find",
+    "findall",
+    "regex_flags",
+    "regex_program",
+    "replace",
+    "slice",
+    "strip",
+    "side_type",
+]
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index b4856784390..22452812e42 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from . import (
+    attributes,
     capitalize,
     case,
     char_types,
@@ -17,3 +18,21 @@
     strip,
 )
 from .side_type import SideType
+
+__all__ = [
+    "attributes",
+    "capitalize",
+    "case",
+    "char_types",
+    "contains",
+    "convert",
+    "extract",
+    "find",
+    "findall",
+    "regex_flags",
+    "regex_program",
+    "replace",
+    "slice",
+    "strip",
+    "SideType",
+]
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/strings/attributes.pxd
new file mode 100644
index 00000000000..27398766924
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/attributes.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+
+
+cpdef Column count_characters(Column source_strings)
+
+cpdef Column count_bytes(Column source_strings)
+
+cpdef Column code_points(Column source_strings)
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx
new file mode 100644
index 00000000000..36bee7bd1d9
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport attributes as cpp_attributes
+
+
+cpdef Column count_characters(Column source_strings):
+    """Returns a column containing character lengths of each string.
+
+    For details, see :cpp:func:`cudf::strings::count_characters`.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with lengths for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.count_characters(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column count_bytes(Column source_strings):
+    """Returns a column containing byte lengths of each string.
+
+    For details, see :cpp:func:`cudf::strings::count_bytes`.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with the number of bytes for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.count_bytes(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column code_points(Column source_strings):
+    """Returns the integer code point of every character of each string.
+
+    For details, see :cpp:func:`cudf::strings::code_points`.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with code point integer values for each character
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.code_points(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx
index 03ecb13a50e..3a6b87504b3 100644
--- a/python/pylibcudf/pylibcudf/strings/findall.pyx
+++ b/python/pylibcudf/pylibcudf/strings/findall.pyx
@@ -13,7 +13,7 @@ cpdef Column findall(Column input, RegexProgram pattern):
     Returns a lists column of strings for each matching occurrence using
     the regex_program pattern within each string.
 
-    For details, see For details, see :cpp:func:`cudf::strings::findall`.
+    For details, see :cpp:func:`cudf::strings::findall`.
 
     Parameters
     ----------
diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py
index babe6634318..9f389fa42c4 100644
--- a/python/pylibcudf/pylibcudf/tests/common/utils.py
+++ b/python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -9,6 +9,7 @@
 import pyarrow.compute as pc
 import pylibcudf as plc
 import pytest
+from pyarrow.orc import write_table as orc_write_table
 from pyarrow.parquet import write_table as pq_write_table
 from pylibcudf.io.types import CompressionType
 
@@ -242,13 +243,21 @@ def is_nested_list(typ):
     return nesting_level(typ)[0] > 1
 
 
-def _convert_numeric_types_to_floating(pa_table):
+def _convert_types(pa_table, input_pred, result_type):
     """
-    Useful little helper for testing the
-    dtypes option in I/O readers.
+    Useful little helper for testing the dtypes option in I/O readers.
 
-    Returns a tuple containing the pylibcudf dtypes
-    and the new pyarrow schema
+    Returns a tuple containing the pylibcudf dtypes and the new pyarrow schema based on
+    the data in the table.
+
+    Parameters
+    ----------
+    pa_table : pyarrow.Table
+        The table from which to extract the dtypes
+    input_pred : function
+        Predicate that evaluates to true for types to replace
+    result_type : pa.DataType
+        The type to cast to
     """
     dtypes = []
     new_fields = []
@@ -257,11 +266,9 @@ def _convert_numeric_types_to_floating(pa_table):
         child_types = []
 
         plc_type = plc.interop.from_arrow(field.type)
-        if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer(
-            field.type
-        ):
-            plc_type = plc.interop.from_arrow(pa.float64())
-            field = field.with_type(pa.float64())
+        if input_pred(field.type):
+            plc_type = plc.interop.from_arrow(result_type)
+            field = field.with_type(result_type)
 
         dtypes.append((field.name, plc_type, child_types))
 
@@ -332,6 +339,16 @@ def make_source(path_or_buf, pa_table, format, **kwargs):
             if isinstance(path_or_buf, io.IOBase)
             else path_or_buf,
         )
+    elif format == "orc":
+        # The conversion to pandas is lossy (doesn't preserve
+        # nested types) so we
+        # will just use pyarrow directly to write this
+        orc_write_table(
+            pa_table,
+            pa.PythonFile(path_or_buf)
+            if isinstance(path_or_buf, io.IOBase)
+            else path_or_buf,
+        )
     if isinstance(path_or_buf, io.IOBase):
         path_or_buf.seek(0)
     return path_or_buf
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
index ccd7eef54f3..ab26f23418d 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
@@ -9,7 +9,7 @@
 import pytest
 from pylibcudf.io.types import CompressionType
 from utils import (
-    _convert_numeric_types_to_floating,
+    _convert_types,
     assert_table_and_meta_eq,
     make_source,
     write_source_str,
@@ -148,7 +148,11 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols):
     if usecols is not None:
         pa_table = pa_table.select(usecols)
 
-    dtypes, new_fields = _convert_numeric_types_to_floating(pa_table)
+    dtypes, new_fields = _convert_types(
+        pa_table,
+        lambda t: (pa.types.is_unsigned_integer(t) or pa.types.is_integer(t)),
+        pa.float64(),
+    )
     # Extract the dtype out of the (name, type, child_types) tuple
     # (read_csv doesn't support this format since it doesn't support nested columns)
     dtypes = {name: dtype for name, dtype, _ in dtypes}
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
new file mode 100644
index 00000000000..42b14b1feff
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import _convert_types, assert_table_and_meta_eq, make_source
+
+# Shared kwargs to pass to make_source
+_COMMON_ORC_SOURCE_KWARGS = {"format": "orc"}
+
+
+@pytest.mark.parametrize("columns", [None, ["col_int64", "col_bool"]])
+def test_read_orc_basic(
+    table_data, binary_source_or_sink, nrows_skiprows, columns
+):
+    """Write a table to ORC with pyarrow and read it back with pylibcudf."""
+    _, expected = table_data
+    nrows, skiprows = nrows_skiprows
+
+    # ORC reader doesn't support skip_rows for nested columns
+    if skiprows > 0:
+        nested_names = [
+            field.name
+            for field in expected.schema
+            if pa.types.is_nested(field.type)
+        ]
+        expected = expected.drop(nested_names)
+
+    # ORC doesn't support unsigned ints, so they are cast to int64
+    _, int64_fields = _convert_types(
+        expected, pa.types.is_unsigned_integer, pa.int64()
+    )
+    expected = expected.cast(pa.schema(int64_fields))
+
+    source = make_source(
+        binary_source_or_sink, expected, **_COMMON_ORC_SOURCE_KWARGS
+    )
+
+    res = plc.io.orc.read_orc(
+        plc.io.SourceInfo([source]),
+        nrows=nrows,
+        skip_rows=skiprows,
+        columns=columns,
+    )
+
+    # Apply the same column selection and row window to the expectation;
+    # nrows == -1 means no row limit.
+    if columns is not None:
+        expected = expected.select(columns)
+
+    row_limit = None if nrows == -1 else nrows
+    expected = expected.slice(offset=skiprows, length=row_limit)
+
+    assert_table_and_meta_eq(expected, res, check_field_nullability=False)
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
new file mode 100644
index 00000000000..76b0424b2af
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import zoneinfo
+
+import pylibcudf as plc
+import pytest
+
+
+def test_make_timezone_transition_table():
+    """The transition table for a known zone is a non-empty Table."""
+    if not zoneinfo.TZPATH:
+        pytest.skip("No TZPATH available.")
+    transitions = plc.io.timezone.make_timezone_transition_table(
+        zoneinfo.TZPATH[0], "America/Los_Angeles"
+    )
+    assert isinstance(transitions, plc.Table)
+    assert transitions.num_rows() > 0
diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
new file mode 100644
index 00000000000..444d0089d2c
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_table_eq
+
+
+@pytest.fixture(scope="module")
+def partitioning_data():
+    """Raw dict, pylibcudf table and pyarrow table holding the same data."""
+    raw = {"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]}
+    arrow_tbl = pa.table(raw)
+    return raw, plc.interop.from_arrow(arrow_tbl), arrow_tbl
+
+
+def test_partition(partitioning_data):
+    raw_data, plc_table, _ = partitioning_data
+    # Send every row to partition 0.
+    partition_map = plc.interop.from_arrow(pa.array([0, 0, 0]))
+    got, got_offsets = plc.partitioning.partition(plc_table, partition_map, 1)
+    unnamed_int64 = pa.field("", pa.int64(), nullable=False)
+    expect = pa.table(
+        list(raw_data.values()),
+        schema=pa.schema([unnamed_int64] * 3),
+    )
+    assert_table_eq(expect, got)
+    # The single partition spans rows [0, 3).
+    assert got_offsets == [0, 3]
+
+
+def test_hash_partition(partitioning_data):
+    """Hashing into a single partition returns all rows at offset 0."""
+    raw_data, plc_table, _ = partitioning_data
+    got, got_offsets = plc.partitioning.hash_partition(plc_table, [0, 1], 1)
+    unnamed_int64 = pa.field("", pa.int64(), nullable=False)
+    expect = pa.table(
+        list(raw_data.values()),
+        schema=pa.schema([unnamed_int64] * 3),
+    )
+    assert_table_eq(expect, got)
+    assert got_offsets == [0]
+
+
+def test_round_robin_partition(partitioning_data):
+    """Round-robin into a single partition returns all rows at offset 0."""
+    raw_data, plc_table, _ = partitioning_data
+    got, got_offsets = plc.partitioning.round_robin_partition(plc_table, 1, 0)
+    unnamed_int64 = pa.field("", pa.int64(), nullable=False)
+    expect = pa.table(
+        list(raw_data.values()),
+        schema=pa.schema([unnamed_int64] * 3),
+    )
+    assert_table_eq(expect, got)
+    assert got_offsets == [0]
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
new file mode 100644
index 00000000000..a1820def0b1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture()
+def str_data():
+    arrow_strings = pa.array(["A", None])  # one ASCII string and one null
+    return arrow_strings, plc.interop.from_arrow(arrow_strings)
+
+
+def test_count_characters(str_data):
+    pa_strings, plc_strings = str_data
+    got = plc.strings.attributes.count_characters(plc_strings)
+    assert_column_eq(pc.utf8_length(pa_strings), got)
+
+
+def test_count_bytes(str_data):
+    """count_bytes must match pyarrow's per-string byte length."""
+    result = plc.strings.attributes.count_bytes(str_data[1])
+    assert_column_eq(pc.binary_length(str_data[0]), result)
+
+
+def test_code_points(str_data):
+    pa_strings, plc_strings = str_data
+    got = plc.strings.attributes.code_points(plc_strings)
+    expect = pa.chunked_array([[ord(pa_strings[0].as_py())]], type=pa.int32())
+    assert_column_eq(expect, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py
new file mode 100644
index 00000000000..ac11123f680
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from packaging.version import parse
+
+
+@pytest.mark.skipif(
+    parse(pa.__version__) < parse("16.0.0"),
+    reason="https://github.com/apache/arrow/pull/40070",
+)
+@pytest.mark.parametrize("arr", [[], [1, 2, 3], [1, 2], [1]])
+def test_transpose(arr):
+    """Check transpose against pandas' ``DataFrame.T`` for several lengths.
+
+    Each case builds a two-column table in which both columns hold ``arr``.
+    """
+    arrow_tbl = pa.table({"a": arr, "b": arr})
+
+    # Round-trip through pylibcudf.
+    transposed = plc.transpose.transpose(plc.interop.from_arrow(arrow_tbl))
+    got = plc.interop.to_arrow(transposed)
+
+    # Reference result from pandas; every column is renamed to "".
+    flipped = arrow_tbl.to_pandas().T
+    unnamed = [""] * len(arr)
+    expect = pa.Table.from_pandas(flipped, preserve_index=False)
+    expect = expect.rename_columns(unnamed)
+    # Adopt the schema of the pylibcudf output before comparing.
+    expect = pa.table(expect, schema=got.schema)
+    assert got.equals(expect)
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
index bcd6185521a..de425a27c15 100644
--- a/python/pylibcudf/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -20,6 +20,8 @@ from .utils cimport int_to_bitmask_ptr
 cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
     """Create a null mask preserving existing nulls and converting nans to null.
 
+    For details, see :cpp:func:`nans_to_nulls`.
+
     Parameters
     ----------
     input : Column
diff --git a/python/pylibcudf/pylibcudf/transpose.pxd b/python/pylibcudf/pylibcudf/transpose.pxd
new file mode 100644
index 00000000000..7b5a7676b49
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transpose.pxd
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from .table cimport Table
+
+
+cpdef Table transpose(Table input_table)
diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx
new file mode 100644
index 00000000000..a708f6cc37f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transpose.pyx
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from pylibcudf.libcudf cimport transpose as cpp_transpose
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table_view cimport table_view
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Table transpose(Table input_table):
+    """Transpose a Table.
+
+    For details, see :cpp:func:`transpose`.
+
+    Parameters
+    ----------
+    input_table : Table
+        Table to transpose
+
+    Returns
+    -------
+    Table
+        Transposed table.
+    """
+    cdef pair[unique_ptr[column], table_view] c_result
+    cdef Table owner_table
+
+    with nogil:
+        c_result = move(cpp_transpose.transpose(input_table.view()))
+    # The one owning column is repeated so each view column has an owner entry.
+    owner_table = Table(
+        [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns()
+    )
+
+    return Table.from_table_view(c_result.second, owner_table)