From e42b91bfca834c55f1b1c77bd4d6b1542523fd5e Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 25 Sep 2024 16:21:36 -0500
Subject: [PATCH 01/11] Add polars to "all" dependency list. (#16875)

This adds Polars to the "all" dependency list, ensuring that devcontainers and developers using the conda environment can use the Polars GPU backend provided by cudf.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16875
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 +
 conda/environments/all_cuda-125_arch-x86_64.yaml | 1 +
 dependencies.yaml                                | 1 +
 3 files changed, 3 insertions(+)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 16b3d112992..5a05dfd0530 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -65,6 +65,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
+- polars>=1.8,<1.9
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<18.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index cce2e0eea84..8490296233d 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -63,6 +63,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
+- polars>=1.8,<1.9
 - pre-commit
 - pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
diff --git a/dependencies.yaml b/dependencies.yaml
index 339adbc5ff9..6909eb7168d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -25,6 +25,7 @@ files:
       - rapids_build_setuptools
       - run_common
       - run_cudf
+      - run_cudf_polars
       - run_pylibcudf
       - run_dask_cudf
       - run_custreamz

From c1f377ab911748700c032465d0b237c6a792d984 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 25 Sep 2024 17:51:44 -0400
Subject: [PATCH 02/11] Migrate ORC reader to pylibcudf (#16042)

xref #15162

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16042
---
 python/cudf/cudf/_lib/io/utils.pxd            |   4 +
 python/cudf/cudf/_lib/orc.pyx                 | 313 ++++--------------
 python/cudf/cudf/_lib/utils.pxd               |   2 +-
 python/cudf/cudf/_lib/utils.pyx               |   8 +-
 python/cudf/cudf/io/orc.py                    |  11 +-
 python/cudf/cudf/tests/test_orc.py            |  34 +-
 python/cudf/cudf/utils/ioutils.py             |   4 +-
 python/pylibcudf/pylibcudf/io/CMakeLists.txt  |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.pxd    |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.py     |   2 +-
 python/pylibcudf/pylibcudf/io/orc.pxd         |  50 +++
 python/pylibcudf/pylibcudf/io/orc.pyx         | 302 +++++++++++++++++
 python/pylibcudf/pylibcudf/io/types.pyx       |   1 +
 python/pylibcudf/pylibcudf/libcudf/io/orc.pxd |   1 +
 .../pylibcudf/libcudf/io/orc_metadata.pxd     |   2 +-
 .../pylibcudf/pylibcudf/tests/common/utils.py |  37 ++-
 .../pylibcudf/pylibcudf/tests/io/test_csv.py  |   8 +-
 .../pylibcudf/pylibcudf/tests/io/test_orc.py  |  53 +++
 18 files changed, 537 insertions(+), 299 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/io/orc.pxd
 create mode 100644 python/pylibcudf/pylibcudf/io/orc.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_orc.py

diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd
index 1938f00c179..76a6e32fde0 100644
--- a/python/cudf/cudf/_lib/io/utils.pxd
+++ b/python/cudf/cudf/_lib/io/utils.pxd
@@ -21,6 +21,10 @@ cdef add_df_col_struct_names(
     df,
     child_names_dict
 )
+cdef update_col_struct_field_names(
+    Column col,
+    child_names
+)
 cdef update_struct_field_names(
     table,
     vector[column_name_info]& schema_info)
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index adeba6fffb1..f88c48ce989 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -1,8 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-import cudf
-from cudf.core.buffer import acquire_spill_lock
-
 from libc.stdint cimport int64_t
 from libcpp cimport bool, int
 from libcpp.map cimport map
@@ -11,187 +8,43 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-import datetime
 from collections import OrderedDict
 
-cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
-
 try:
     import ujson as json
 except ImportError:
     import json
 
 cimport pylibcudf.libcudf.io.types as cudf_io_types
+cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
 from pylibcudf.libcudf.io.data_sink cimport data_sink
 from pylibcudf.libcudf.io.orc cimport (
     chunked_orc_writer_options,
     orc_chunked_writer,
-    orc_reader_options,
     orc_writer_options,
-    read_orc as libcudf_read_orc,
     write_orc as libcudf_write_orc,
 )
-from pylibcudf.libcudf.io.orc_metadata cimport (
-    binary_statistics,
-    bucket_statistics,
-    column_statistics,
-    date_statistics,
-    decimal_statistics,
-    double_statistics,
-    integer_statistics,
-    no_statistics,
-    parsed_orc_statistics,
-    read_parsed_orc_statistics as libcudf_read_parsed_orc_statistics,
-    statistics_type,
-    string_statistics,
-    timestamp_statistics,
-)
 from pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
     compression_type,
     sink_info,
-    source_info,
     table_input_metadata,
-    table_with_metadata,
 )
 from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport data_type, size_type, type_id
-from pylibcudf.variant cimport get_if as std_get_if, holds_alternative
 
 from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport (
-    make_sink_info,
-    make_source_info,
-    update_column_struct_field_names,
-)
+from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names
+from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
 
-from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
-
-from cudf._lib.types cimport underlying_type_t_type_id
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+import pylibcudf as plc
 
+import cudf
+from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
+from cudf.core.buffer import acquire_spill_lock
 
 
-cdef _parse_column_type_statistics(column_statistics stats):
-    # Initialize stats to return and parse stats blob
-    column_stats = {}
-
-    if stats.number_of_values.has_value():
-        column_stats["number_of_values"] = stats.number_of_values.value()
-
-    if stats.has_null.has_value():
-        column_stats["has_null"] = stats.has_null.value()
-
-    cdef statistics_type type_specific_stats = stats.type_specific_stats
-
-    cdef integer_statistics* int_stats
-    cdef double_statistics* dbl_stats
-    cdef string_statistics* str_stats
-    cdef bucket_statistics* bucket_stats
-    cdef decimal_statistics* dec_stats
-    cdef date_statistics* date_stats
-    cdef binary_statistics* bin_stats
-    cdef timestamp_statistics* ts_stats
-
-    if holds_alternative[no_statistics](type_specific_stats):
-        return column_stats
-    elif int_stats := std_get_if[integer_statistics](&type_specific_stats):
-        if int_stats.minimum.has_value():
-            column_stats["minimum"] = int_stats.minimum.value()
-        else:
-            column_stats["minimum"] = None
-        if int_stats.maximum.has_value():
-            column_stats["maximum"] = int_stats.maximum.value()
-        else:
-            column_stats["maximum"] = None
-        if int_stats.sum.has_value():
-            column_stats["sum"] = int_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif dbl_stats := std_get_if[double_statistics](&type_specific_stats):
-        if dbl_stats.minimum.has_value():
-            column_stats["minimum"] = dbl_stats.minimum.value()
-        else:
-            column_stats["minimum"] = None
-        if dbl_stats.maximum.has_value():
-            column_stats["maximum"] = dbl_stats.maximum.value()
-        else:
-            column_stats["maximum"] = None
-        if dbl_stats.sum.has_value():
-            column_stats["sum"] = dbl_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif str_stats := std_get_if[string_statistics](&type_specific_stats):
-        if str_stats.minimum.has_value():
-            column_stats["minimum"] = str_stats.minimum.value().decode("utf-8")
-        else:
-            column_stats["minimum"] = None
-        if str_stats.maximum.has_value():
-            column_stats["maximum"] = str_stats.maximum.value().decode("utf-8")
-        else:
-            column_stats["maximum"] = None
-        if str_stats.sum.has_value():
-            column_stats["sum"] = str_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif bucket_stats := std_get_if[bucket_statistics](&type_specific_stats):
-        column_stats["true_count"] = bucket_stats.count[0]
-        column_stats["false_count"] = (
-            column_stats["number_of_values"]
-            - column_stats["true_count"]
-        )
-    elif dec_stats := std_get_if[decimal_statistics](&type_specific_stats):
-        if dec_stats.minimum.has_value():
-            column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8")
-        else:
-            column_stats["minimum"] = None
-        if dec_stats.maximum.has_value():
-            column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8")
-        else:
-            column_stats["maximum"] = None
-        if dec_stats.sum.has_value():
-            column_stats["sum"] = dec_stats.sum.value().decode("utf-8")
-        else:
-            column_stats["sum"] = None
-    elif date_stats := std_get_if[date_statistics](&type_specific_stats):
-        if date_stats.minimum.has_value():
-            column_stats["minimum"] = datetime.datetime.fromtimestamp(
-                datetime.timedelta(date_stats.minimum.value()).total_seconds(),
-                datetime.timezone.utc,
-            )
-        else:
-            column_stats["minimum"] = None
-        if date_stats.maximum.has_value():
-            column_stats["maximum"] = datetime.datetime.fromtimestamp(
-                datetime.timedelta(date_stats.maximum.value()).total_seconds(),
-                datetime.timezone.utc,
-            )
-        else:
-            column_stats["maximum"] = None
-    elif bin_stats := std_get_if[binary_statistics](&type_specific_stats):
-        if bin_stats.sum.has_value():
-            column_stats["sum"] = bin_stats.sum.value()
-        else:
-            column_stats["sum"] = None
-    elif ts_stats := std_get_if[timestamp_statistics](&type_specific_stats):
-        # Before ORC-135, the local timezone offset was included and they were
-        # stored as minimum and maximum. After ORC-135, the timestamp is
-        # adjusted to UTC before being converted to milliseconds and stored
-        # in minimumUtc and maximumUtc.
-        # TODO: Support minimum and maximum by reading writer's local timezone
-        if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value():
-            column_stats["minimum"] = datetime.datetime.fromtimestamp(
-                ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc
-            )
-            column_stats["maximum"] = datetime.datetime.fromtimestamp(
-                ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc
-            )
-    else:
-        raise ValueError("Unsupported statistics type")
-    return column_stats
-
-
+# TODO: Consider inlining this function since it seems to only be used in one place.
 cpdef read_parsed_orc_statistics(filepath_or_buffer):
     """
     Cython function to call into libcudf API, see `read_parsed_orc_statistics`.
@@ -201,25 +54,13 @@ cpdef read_parsed_orc_statistics(filepath_or_buffer):
     cudf.io.orc.read_orc_statistics
     """
 
-    cdef parsed_orc_statistics parsed = (
-        libcudf_read_parsed_orc_statistics(make_source_info([filepath_or_buffer]))
+    parsed = (
+        plc.io.orc.read_parsed_orc_statistics(
+            plc.io.SourceInfo([filepath_or_buffer])
+        )
     )
 
-    cdef vector[column_statistics] file_stats = parsed.file_stats
-    cdef vector[vector[column_statistics]] stripes_stats = parsed.stripes_stats
-
-    parsed_file_stats = [
-        _parse_column_type_statistics(file_stats[column_index])
-        for column_index in range(file_stats.size())
-    ]
-
-    parsed_stripes_stats = [
-        [_parse_column_type_statistics(stripes_stats[stripe_index][column_index])
-         for column_index in range(stripes_stats[stripe_index].size())]
-        for stripe_index in range(stripes_stats.size())
-    ]
-
-    return parsed.column_names, parsed_file_stats, parsed_stripes_stats
+    return parsed.column_names, parsed.file_stats, parsed.stripes_stats
 
 
 cpdef read_orc(object filepaths_or_buffers,
@@ -235,36 +76,34 @@ cpdef read_orc(object filepaths_or_buffers,
     See Also
     --------
     cudf.read_orc
+
+    Notes
+    -----
+    Currently this function only considers the metadata of the first file in the list of
+    filepaths_or_buffers.
     """
-    cdef orc_reader_options c_orc_reader_options = make_orc_reader_options(
-        filepaths_or_buffers,
+
+    if columns is not None:
+        columns = [str(col) for col in columns]
+
+    tbl_w_meta = plc.io.orc.read_orc(
+        plc.io.SourceInfo(filepaths_or_buffers),
         columns,
-        stripes or [],
+        stripes,
         get_skiprows_arg(skip_rows),
         get_num_rows_arg(num_rows),
-        (
-            type_id.EMPTY
-            if timestamp_type is None else
-            <type_id>(
-                <underlying_type_t_type_id> (
-                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
-                        cudf.dtype(timestamp_type)
-                    ]
-                )
-            )
-        ),
         use_index,
+        # Keyword is required: the 7th positional slot of read_orc is
+        # `use_np_dtypes`, so a positional DataType lands there instead.
+        timestamp_type=None if timestamp_type is None else plc.types.DataType(
+            SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[cudf.dtype(timestamp_type)]
+        ),
     )
 
-    cdef table_with_metadata c_result
-    cdef size_type nrows
+    names = tbl_w_meta.column_names(include_children=False)
 
-    with nogil:
-        c_result = move(libcudf_read_orc(c_orc_reader_options))
-
-    names = [info.name.decode() for info in c_result.metadata.schema_info]
     actual_index_names, col_names, is_range_index, reset_index_name, \
-        range_idx = _get_index_from_metadata(c_result.metadata.user_data,
+        range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data,
                                              names,
                                              skip_rows,
                                              num_rows)
@@ -272,11 +111,11 @@ cpdef read_orc(object filepaths_or_buffers,
     if columns is not None and (isinstance(columns, list) and len(columns) == 0):
         # When `columns=[]`, index needs to be
         # established, but not the columns.
-        nrows = c_result.tbl.get()[0].view().num_rows()
+        nrows = tbl_w_meta.tbl.num_rows()
         return {}, cudf.RangeIndex(nrows)
 
-    data, index = data_from_unique_ptr(
-        move(c_result.tbl),
+    data, index = data_from_pylibcudf_io(
+        tbl_w_meta,
         col_names if columns is None else names,
         actual_index_names
     )
@@ -286,11 +125,13 @@ cpdef read_orc(object filepaths_or_buffers,
     elif reset_index_name:
         index.names = [None] * len(index.names)
 
+    child_name_values = tbl_w_meta.child_names.values()
+
     data = {
-        name: update_column_struct_field_names(
-            col, c_result.metadata.schema_info[i]
+        name: update_col_struct_field_names(
+            col, child_names
         )
-        for i, (name, col) in enumerate(data.items())
+        for (name, col), child_names in zip(data.items(), child_name_values)
     }
 
     return data, index
@@ -313,32 +154,35 @@ cdef compression_type _get_comp_type(object compression):
         raise ValueError(f"Unsupported `compression` type {compression}")
 
 cdef tuple _get_index_from_metadata(
-        map[string, string] user_data,
+        vector[map[string, string]] user_data,
         object names,
         object skip_rows,
         object num_rows):
-    json_str = user_data[b'pandas'].decode('utf-8')
+
     meta = None
     index_col = None
     is_range_index = False
     reset_index_name = False
     range_idx = None
-    if json_str != "":
-        meta = json.loads(json_str)
-        if 'index_columns' in meta and len(meta['index_columns']) > 0:
-            index_col = meta['index_columns']
-            if isinstance(index_col[0], dict) and \
-                    index_col[0]['kind'] == 'range':
-                is_range_index = True
-            else:
-                index_col_names = OrderedDict()
-                for idx_col in index_col:
-                    for c in meta['columns']:
-                        if c['field_name'] == idx_col:
-                            index_col_names[idx_col] = \
-                                c['name'] or c['field_name']
-                            if c['name'] is None:
-                                reset_index_name = True
+
+    if user_data.size() > 0:
+        json_str = user_data[0][b'pandas'].decode('utf-8')
+        if json_str != "":
+            meta = json.loads(json_str)
+            if 'index_columns' in meta and len(meta['index_columns']) > 0:
+                index_col = meta['index_columns']
+                if isinstance(index_col[0], dict) and \
+                        index_col[0]['kind'] == 'range':
+                    is_range_index = True
+                else:
+                    index_col_names = OrderedDict()
+                    for idx_col in index_col:
+                        for c in meta['columns']:
+                            if c['field_name'] == idx_col:
+                                index_col_names[idx_col] = \
+                                    c['name'] or c['field_name']
+                                if c['name'] is None:
+                                    reset_index_name = True
 
     actual_index_names = None
     if index_col is not None and len(index_col) > 0:
@@ -473,41 +317,6 @@ cdef int64_t get_num_rows_arg(object arg) except*:
     return <int64_t> arg
 
 
-cdef orc_reader_options make_orc_reader_options(
-    object filepaths_or_buffers,
-    object column_names,
-    object stripes,
-    int64_t skip_rows,
-    int64_t num_rows,
-    type_id timestamp_type,
-    bool use_index
-) except*:
-
-    cdef vector[vector[size_type]] strps = stripes
-    cdef orc_reader_options opts
-    cdef source_info src = make_source_info(filepaths_or_buffers)
-    opts = move(
-        orc_reader_options.builder(src)
-        .stripes(strps)
-        .skip_rows(skip_rows)
-        .timestamp_type(data_type(timestamp_type))
-        .use_index(use_index)
-        .build()
-    )
-    if num_rows >= 0:
-        opts.set_num_rows(num_rows)
-
-    cdef vector[string] c_column_names
-    if column_names is not None:
-        c_column_names.reserve(len(column_names))
-        for col in column_names:
-            c_column_names.push_back(str(col).encode())
-        if len(column_names) > 0:
-            opts.set_columns(c_column_names)
-
-    return opts
-
-
 cdef class ORCWriter:
     """
     ORCWriter lets you you incrementally write out a ORC file from a series
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index ff97fe80310..7254db5c43d 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
 cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
-cdef data_from_pylibcudf_io(tbl_with_meta)
+cdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
 cdef table_view table_view_from_columns(columns) except *
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 8660cca9322..9e5b99f64eb 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -316,15 +316,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
         index_names
     )
 
-cdef data_from_pylibcudf_io(tbl_with_meta):
+cdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None):
     """
     Unpacks the TableWithMetadata from libcudf I/O
     into a dict of columns and an Index (cuDF format)
     """
+    if column_names is None:
+        column_names = tbl_with_meta.column_names(include_children=False)
     return _data_from_columns(
         columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns],
-        column_names=tbl_with_meta.column_names(include_children=False),
-        index_names=None
+        column_names=column_names,
+        index_names=index_names
     )
 
 cdef columns_from_table_view(
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index c54293badbe..68b60809bb9 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -181,11 +181,6 @@ def read_orc_statistics(
             parsed_stripes_statistics,
         ) = liborc.read_parsed_orc_statistics(path_or_buf)
 
-        # Parse column names
-        column_names = [
-            column_name.decode("utf-8") for column_name in column_names
-        ]
-
         # Parse file statistics
         file_statistics = {
             column_name: column_stats
@@ -248,9 +243,9 @@ def _filter_stripes(
         num_rows_scanned = 0
         for i, stripe_statistics in enumerate(stripes_statistics):
             num_rows_before_stripe = num_rows_scanned
-            num_rows_scanned += next(iter(stripe_statistics.values()))[
-                "number_of_values"
-            ]
+            num_rows_scanned += next(
+                iter(stripe_statistics.values())
+            ).number_of_values
             if stripes is not None and i not in stripes:
                 continue
             if skip_rows is not None and num_rows_scanned <= skip_rows:
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index c2a30b76bea..1dd732c7191 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -184,25 +184,25 @@ def test_orc_read_statistics(datadir):
         pytest.skip(".orc file is not found: %s" % e)
 
     # Check numberOfValues
-    assert_eq(file_statistics[0]["int1"]["number_of_values"], 11_000)
+    assert_eq(file_statistics[0]["int1"].number_of_values, 11_000)
     assert_eq(
-        file_statistics[0]["int1"]["number_of_values"],
+        file_statistics[0]["int1"].number_of_values,
         sum(
             [
-                stripes_statistics[0]["int1"]["number_of_values"],
-                stripes_statistics[1]["int1"]["number_of_values"],
-                stripes_statistics[2]["int1"]["number_of_values"],
+                stripes_statistics[0]["int1"].number_of_values,
+                stripes_statistics[1]["int1"].number_of_values,
+                stripes_statistics[2]["int1"].number_of_values,
             ]
         ),
     )
     assert_eq(
-        stripes_statistics[1]["int1"]["number_of_values"],
-        stripes_statistics[1]["string1"]["number_of_values"],
+        stripes_statistics[1]["int1"].number_of_values,
+        stripes_statistics[1]["string1"].number_of_values,
     )
-    assert_eq(stripes_statistics[2]["string1"]["number_of_values"], 1_000)
+    assert_eq(stripes_statistics[2]["string1"].number_of_values, 1_000)
 
     # Check other statistics
-    assert_eq(stripes_statistics[2]["string1"]["has_null"], False)
+    assert_eq(stripes_statistics[2]["string1"].has_null, False)
     assert_eq(
         file_statistics[0]["int1"]["minimum"],
         min(
@@ -1538,8 +1538,8 @@ def test_empty_statistics():
     for stats in got:
         # Similar expected stats for the first 6 columns in this case
         for col_name in ascii_lowercase[:6]:
-            assert stats[0][col_name].get("number_of_values") == 0
-            assert stats[0][col_name].get("has_null") is True
+            assert stats[0][col_name].number_of_values == 0
+            assert stats[0][col_name].has_null is True
             assert stats[0][col_name].get("minimum") is None
             assert stats[0][col_name].get("maximum") is None
         for col_name in ascii_lowercase[:3]:
@@ -1547,17 +1547,17 @@ def test_empty_statistics():
         # Sum for decimal column is a string
         assert stats[0]["d"].get("sum") == "0"
 
-        assert stats[0]["g"].get("number_of_values") == 0
-        assert stats[0]["g"].get("has_null") is True
+        assert stats[0]["g"].number_of_values == 0
+        assert stats[0]["g"].has_null is True
         assert stats[0]["g"].get("true_count") == 0
         assert stats[0]["g"].get("false_count") == 0
 
-        assert stats[0]["h"].get("number_of_values") == 0
-        assert stats[0]["h"].get("has_null") is True
+        assert stats[0]["h"].number_of_values == 0
+        assert stats[0]["h"].has_null is True
         assert stats[0]["h"].get("sum") == 0
 
-        assert stats[0]["i"].get("number_of_values") == 1
-        assert stats[0]["i"].get("has_null") is False
+        assert stats[0]["i"].number_of_values == 1
+        assert stats[0]["i"].has_null is False
         assert stats[0]["i"].get("minimum") == 1
         assert stats[0]["i"].get("maximum") == 1
         assert stats[0]["i"].get("sum") == 1
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 1180da321e6..d636f36f282 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1873,7 +1873,7 @@ def _apply_filter_bool_eq(val, col_stats):
                 return False
         elif val is False:
             if (col_stats["false_count"] == 0) or (
-                col_stats["true_count"] == col_stats["number_of_values"]
+                col_stats["true_count"] == col_stats.number_of_values
             ):
                 return False
     return True
@@ -1900,7 +1900,7 @@ def _apply_predicate(op, val, col_stats):
             return False
         # TODO: Replace pd.isnull with
         # cudf.isnull once it is implemented
-        if pd.isnull(val) and not col_stats["has_null"]:
+        if pd.isnull(val) and not col_stats.has_null:
             return False
         if not _apply_filter_bool_eq(val, col_stats):
             return False
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index bcc2151f5b6..529a71a48ce 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx parquet.pyx types.pyx)
+set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx types.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
index 62820048584..5927a19dc69 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/io/__init__.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, parquet, types
+from . cimport avro, datasource, json, orc, parquet, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 27640f7d955..5d899ee0808 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, parquet, types
+from . import avro, csv, datasource, json, orc, parquet, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd
new file mode 100644
index 00000000000..b111d617b1b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/orc.pxd
@@ -0,0 +1,50 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libc.stdint cimport uint64_t
+from libcpp cimport bool
+from libcpp.optional cimport optional
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.orc_metadata cimport (
+    column_statistics,
+    parsed_orc_statistics,
+    statistics_type,
+)
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.types cimport DataType
+
+
+cpdef TableWithMetadata read_orc(
+    SourceInfo source_info,
+    list columns = *,
+    list stripes = *,
+    size_type skip_rows = *,
+    size_type nrows = *,
+    bool use_index = *,
+    bool use_np_dtypes = *,
+    DataType timestamp_type = *,
+    list decimal128_columns = *
+)
+
+cdef class OrcColumnStatistics:
+    cdef optional[uint64_t] number_of_values_c
+    cdef optional[bool] has_null_c
+    cdef statistics_type type_specific_stats_c
+    cdef dict column_stats
+
+    cdef void _init_stats_dict(self)
+
+    @staticmethod
+    cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats)
+
+
+cdef class ParsedOrcStatistics:
+    cdef parsed_orc_statistics c_obj
+
+    @staticmethod
+    cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats)
+
+
+cpdef ParsedOrcStatistics read_parsed_orc_statistics(
+    SourceInfo source_info
+)
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
new file mode 100644
index 00000000000..01a5e4b04a1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -0,0 +1,302 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp cimport bool
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+import datetime
+
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.orc cimport (
+    orc_reader_options,
+    read_orc as cpp_read_orc,
+)
+from pylibcudf.libcudf.io.orc_metadata cimport (
+    binary_statistics,
+    bucket_statistics,
+    column_statistics,
+    date_statistics,
+    decimal_statistics,
+    double_statistics,
+    integer_statistics,
+    no_statistics,
+    read_parsed_orc_statistics as cpp_read_parsed_orc_statistics,
+    statistics_type,
+    string_statistics,
+    timestamp_statistics,
+)
+from pylibcudf.libcudf.io.types cimport table_with_metadata
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.types cimport DataType
+from pylibcudf.variant cimport get_if, holds_alternative
+
+
+cdef class OrcColumnStatistics:
+    def __init__(self):
+        raise TypeError(
+            "OrcColumnStatistics should not be instantiated by users. If it is "
+            "being constructed in Cython from a preexisting libcudf object, "
+            "use `OrcColumnStatistics.from_libcudf` instead."
+        )
+
+    @property
+    def number_of_values(self):
+        if self.number_of_values_c.has_value():
+            return self.number_of_values_c.value()
+        return None
+
+    @property
+    def has_null(self):
+        if self.has_null_c.has_value():
+            return self.has_null_c.value()
+        return None
+
+    cdef void _init_stats_dict(self):
+        # Rebuild the Python-visible stats dict from the stored variant
+        self.column_stats = {}
+        # Local copy of the variant: get_if below is handed its address
+        cdef statistics_type type_specific_stats = self.type_specific_stats_c
+        # One typed pointer per statistics alternative probed in the chain below
+        cdef integer_statistics* int_stats
+        cdef double_statistics* dbl_stats
+        cdef string_statistics* str_stats
+        cdef bucket_statistics* bucket_stats
+        cdef decimal_statistics* dec_stats
+        cdef date_statistics* date_stats
+        cdef binary_statistics* bin_stats
+        cdef timestamp_statistics* ts_stats
+
+        if holds_alternative[no_statistics](type_specific_stats):
+            pass
+        elif int_stats := get_if[integer_statistics](&type_specific_stats):
+            if int_stats.minimum.has_value():
+                self.column_stats["minimum"] = int_stats.minimum.value()
+            else:
+                self.column_stats["minimum"] = None
+            if int_stats.maximum.has_value():
+                self.column_stats["maximum"] = int_stats.maximum.value()
+            else:
+                self.column_stats["maximum"] = None
+            if int_stats.sum.has_value():
+                self.column_stats["sum"] = int_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif dbl_stats := get_if[double_statistics](&type_specific_stats):
+            if dbl_stats.minimum.has_value():
+                self.column_stats["minimum"] = dbl_stats.minimum.value()
+            else:
+                self.column_stats["minimum"] = None
+            if dbl_stats.maximum.has_value():
+                self.column_stats["maximum"] = dbl_stats.maximum.value()
+            else:
+                self.column_stats["maximum"] = None
+            if dbl_stats.sum.has_value():
+                self.column_stats["sum"] = dbl_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif str_stats := get_if[string_statistics](&type_specific_stats):
+            if str_stats.minimum.has_value():
+                self.column_stats["minimum"] = str_stats.minimum.value().decode("utf-8")
+            else:
+                self.column_stats["minimum"] = None
+            if str_stats.maximum.has_value():
+                self.column_stats["maximum"] = str_stats.maximum.value().decode("utf-8")
+            else:
+                self.column_stats["maximum"] = None
+            if str_stats.sum.has_value():
+                self.column_stats["sum"] = str_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif bucket_stats := get_if[bucket_statistics](&type_specific_stats):
+            self.column_stats["true_count"] = bucket_stats.count[0]
+            self.column_stats["false_count"] = (
+                self.number_of_values
+                - self.column_stats["true_count"]
+            )
+        elif dec_stats := get_if[decimal_statistics](&type_specific_stats):
+            if dec_stats.minimum.has_value():
+                self.column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8")
+            else:
+                self.column_stats["minimum"] = None
+            if dec_stats.maximum.has_value():
+                self.column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8")
+            else:
+                self.column_stats["maximum"] = None
+            if dec_stats.sum.has_value():
+                self.column_stats["sum"] = dec_stats.sum.value().decode("utf-8")
+            else:
+                self.column_stats["sum"] = None
+        elif date_stats := get_if[date_statistics](&type_specific_stats):
+            if date_stats.minimum.has_value():
+                self.column_stats["minimum"] = datetime.datetime.fromtimestamp(
+                    datetime.timedelta(date_stats.minimum.value()).total_seconds(),
+                    datetime.timezone.utc,
+                )
+            else:
+                self.column_stats["minimum"] = None
+            if date_stats.maximum.has_value():
+                self.column_stats["maximum"] = datetime.datetime.fromtimestamp(
+                    datetime.timedelta(date_stats.maximum.value()).total_seconds(),
+                    datetime.timezone.utc,
+                )
+            else:
+                self.column_stats["maximum"] = None
+        elif bin_stats := get_if[binary_statistics](&type_specific_stats):
+            if bin_stats.sum.has_value():
+                self.column_stats["sum"] = bin_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif ts_stats := get_if[timestamp_statistics](&type_specific_stats):
+            # Before ORC-135, the local timezone offset was included and they were
+            # stored as minimum and maximum. After ORC-135, the timestamp is
+            # adjusted to UTC before being converted to milliseconds and stored
+            # in minimumUtc and maximumUtc.
+            # TODO: Support minimum and maximum by reading writer's local timezone
+            if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value():
+                self.column_stats["minimum"] = datetime.datetime.fromtimestamp(
+                    ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc
+                )
+                self.column_stats["maximum"] = datetime.datetime.fromtimestamp(
+                    ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc
+                )
+        else:
+            raise ValueError("Unsupported statistics type")
+
+    def __getitem__(self, item):
+        return self.column_stats[item]
+
+    def __contains__(self, item):
+        return item in self.column_stats
+
+    def get(self, item, default=None):
+        return self.column_stats.get(item, default)
+
+    @staticmethod
+    cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats):
+        cdef OrcColumnStatistics out = OrcColumnStatistics.__new__(OrcColumnStatistics)
+        out.number_of_values_c = col_stats.number_of_values
+        out.has_null_c = col_stats.has_null
+        out.type_specific_stats_c = col_stats.type_specific_stats
+        out._init_stats_dict()
+        return out
+
+
+cdef class ParsedOrcStatistics:
+    """Holds a libcudf parsed_orc_statistics: column names, file and stripe stats."""
+    @property
+    def column_names(self):
+        return [name.decode() for name in self.c_obj.column_names]
+
+    @property
+    def file_stats(self):
+        return [
+            OrcColumnStatistics.from_libcudf(self.c_obj.file_stats[i])
+            for i in range(self.c_obj.file_stats.size())
+        ]
+
+    @property
+    def stripes_stats(self):
+        return [
+            [
+                OrcColumnStatistics.from_libcudf(stripe_stats_c[i])
+                for i in range(stripe_stats_c.size())
+            ]
+            for stripe_stats_c in self.c_obj.stripes_stats
+        ]
+
+    @staticmethod
+    cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats):
+        cdef ParsedOrcStatistics out = ParsedOrcStatistics.__new__(ParsedOrcStatistics)
+        out.c_obj = move(orc_stats)
+        return out
+
+
+cpdef TableWithMetadata read_orc(
+    SourceInfo source_info,
+    list columns = None,
+    list stripes = None,
+    size_type skip_rows = 0,
+    size_type nrows = -1,
+    bool use_index = True,
+    bool use_np_dtypes = True,
+    DataType timestamp_type = None,
+    list decimal128_columns = None,
+):
+    """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the ORC file from.
+    columns : list, default None
+        The string names of the columns to be read.
+    stripes : list[list[size_type]], default None
+        List of stripes to be read.
+    skip_rows : size_type, default 0
+        The number of rows to skip from the start of the file.
+    nrows : size_type, default -1
+        The number of rows to read. By default, read the entire file.
+    use_index : bool, default True
+        Whether to use the row index to speed up reading.
+    use_np_dtypes : bool, default True
+        Whether to use numpy compatible dtypes.
+    timestamp_type : DataType, default None
+        The timestamp type to use for the timestamp columns.
+    decimal128_columns : list, default None
+        List of column names to be read as 128-bit decimals.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata (column names) that were read in.
+    """
+    cdef orc_reader_options opts
+    cdef vector[vector[size_type]] c_stripes
+    opts = move(
+        orc_reader_options.builder(source_info.c_obj)
+        .use_index(use_index)
+        .build()
+    )
+    opts.enable_use_np_dtypes(use_np_dtypes)
+    if nrows >= 0:
+        opts.set_num_rows(nrows)
+    if skip_rows >= 0:
+        opts.set_skip_rows(skip_rows)
+    if stripes is not None:
+        c_stripes = stripes
+        opts.set_stripes(c_stripes)
+    if timestamp_type is not None:
+        opts.set_timestamp_type(timestamp_type.c_obj)
+
+    cdef vector[string] c_decimal128_columns
+    if decimal128_columns is not None and len(decimal128_columns) > 0:
+        c_decimal128_columns.reserve(len(decimal128_columns))
+        for col in decimal128_columns:
+            if not isinstance(col, str):
+                raise TypeError("Decimal 128 column names must be strings!")
+            c_decimal128_columns.push_back(col.encode())
+        opts.set_decimal128_columns(c_decimal128_columns)
+
+    cdef vector[string] c_column_names
+    if columns is not None and len(columns) > 0:
+        c_column_names.reserve(len(columns))
+        for col in columns:
+            if not isinstance(col, str):
+                raise TypeError("Column names must be strings!")
+            c_column_names.push_back(col.encode())
+        opts.set_columns(c_column_names)
+
+    cdef table_with_metadata c_result
+    with nogil:
+        c_result = move(cpp_read_orc(opts))
+
+    return TableWithMetadata.from_libcudf(c_result)
+
+
+cpdef ParsedOrcStatistics read_parsed_orc_statistics(SourceInfo source_info):
+    """Read the parsed file-level and stripe-level statistics of an ORC source."""
+    cdef parsed_orc_statistics c_stats
+
+    c_stats = cpp_read_parsed_orc_statistics(source_info.c_obj)
+
+    return ParsedOrcStatistics.from_libcudf(c_stats)
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 1600a805b37..563a02761da 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -130,6 +130,7 @@ cdef class TableWithMetadata:
         """
         return self.metadata.per_file_user_data
 
+
 cdef class SourceInfo:
     """A class containing details on a source to read from.
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
index e4a09b8feb2..dca24c7f665 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
@@ -35,6 +35,7 @@ cdef extern from "cudf/io/orc.hpp" \
         void enable_use_index(bool val) except +
         void enable_use_np_dtypes(bool val) except +
         void set_timestamp_type(data_type type) except +
+        void set_decimal128_columns(vector[string] val) except +
 
         @staticmethod
         orc_reader_options_builder builder(
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
index db6cb0cdfa5..9302ffe2f80 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
@@ -1,11 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-cimport pylibcudf.libcudf.io.types as cudf_io_types
 from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t
 from libcpp cimport bool
 from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from pylibcudf.libcudf.io cimport types as cudf_io_types
 from pylibcudf.variant cimport monostate, variant
 
 
diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py
index babe6634318..9f389fa42c4 100644
--- a/python/pylibcudf/pylibcudf/tests/common/utils.py
+++ b/python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -9,6 +9,7 @@
 import pyarrow.compute as pc
 import pylibcudf as plc
 import pytest
+from pyarrow.orc import write_table as orc_write_table
 from pyarrow.parquet import write_table as pq_write_table
 from pylibcudf.io.types import CompressionType
 
@@ -242,13 +243,21 @@ def is_nested_list(typ):
     return nesting_level(typ)[0] > 1
 
 
-def _convert_numeric_types_to_floating(pa_table):
+def _convert_types(pa_table, input_pred, result_type):
     """
-    Useful little helper for testing the
-    dtypes option in I/O readers.
+    Useful little helper for testing the dtypes option in I/O readers.
 
-    Returns a tuple containing the pylibcudf dtypes
-    and the new pyarrow schema
+    Returns a tuple containing the pylibcudf dtypes and the new pyarrow schema based on
+    the data in the table.
+
+    Parameters
+    ----------
+    pa_table : pyarrow.Table
+        The table from which to extract the dtypes
+    input_pred : function
+        Predicate that evaluates to true for types to replace
+    result_type : pa.DataType
+        The type to cast to
     """
     dtypes = []
     new_fields = []
@@ -257,11 +266,9 @@ def _convert_numeric_types_to_floating(pa_table):
         child_types = []
 
         plc_type = plc.interop.from_arrow(field.type)
-        if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer(
-            field.type
-        ):
-            plc_type = plc.interop.from_arrow(pa.float64())
-            field = field.with_type(pa.float64())
+        if input_pred(field.type):
+            plc_type = plc.interop.from_arrow(result_type)
+            field = field.with_type(result_type)
 
         dtypes.append((field.name, plc_type, child_types))
 
@@ -332,6 +339,16 @@ def make_source(path_or_buf, pa_table, format, **kwargs):
             if isinstance(path_or_buf, io.IOBase)
             else path_or_buf,
         )
+    elif format == "orc":
+        # The conversion to pandas is lossy (doesn't preserve
+        # nested types) so we
+        # will just use pyarrow directly to write this
+        orc_write_table(
+            pa_table,
+            pa.PythonFile(path_or_buf)
+            if isinstance(path_or_buf, io.IOBase)
+            else path_or_buf,
+        )
     if isinstance(path_or_buf, io.IOBase):
         path_or_buf.seek(0)
     return path_or_buf
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
index ccd7eef54f3..ab26f23418d 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
@@ -9,7 +9,7 @@
 import pytest
 from pylibcudf.io.types import CompressionType
 from utils import (
-    _convert_numeric_types_to_floating,
+    _convert_types,
     assert_table_and_meta_eq,
     make_source,
     write_source_str,
@@ -148,7 +148,11 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols):
     if usecols is not None:
         pa_table = pa_table.select(usecols)
 
-    dtypes, new_fields = _convert_numeric_types_to_floating(pa_table)
+    dtypes, new_fields = _convert_types(
+        pa_table,
+        lambda t: (pa.types.is_unsigned_integer(t) or pa.types.is_integer(t)),
+        pa.float64(),
+    )
     # Extract the dtype out of the (name, type, child_types) tuple
     # (read_csv doesn't support this format since it doesn't support nested columns)
     dtypes = {name: dtype for name, dtype, _ in dtypes}
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
new file mode 100644
index 00000000000..42b14b1feff
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import _convert_types, assert_table_and_meta_eq, make_source
+
+# Shared kwargs to pass to make_source
+_COMMON_ORC_SOURCE_KWARGS = {"format": "orc"}
+
+
+@pytest.mark.parametrize("columns", [None, ["col_int64", "col_bool"]])
+def test_read_orc_basic(
+    table_data, binary_source_or_sink, nrows_skiprows, columns
+):
+    """Write a pyarrow table as ORC and compare read_orc's output to it."""
+    _, expected = table_data
+    nrows, skiprows = nrows_skiprows
+
+    # ORC reader doesn't support skip_rows for nested columns, so drop them
+    if skiprows > 0:
+        nested_names = [
+            field.name
+            for field in expected.schema
+            if pa.types.is_nested(field.type)
+        ]
+        expected = expected.drop(nested_names)
+
+    # ORC doesn't support unsigned ints, so cast those columns to int64
+    # before the file is written
+    _, casted_fields = _convert_types(
+        expected, pa.types.is_unsigned_integer, pa.int64()
+    )
+    expected = expected.cast(pa.schema(casted_fields))
+
+    source = make_source(
+        binary_source_or_sink, expected, **_COMMON_ORC_SOURCE_KWARGS
+    )
+
+    result = plc.io.orc.read_orc(
+        plc.io.SourceInfo([source]),
+        nrows=nrows,
+        skip_rows=skiprows,
+        columns=columns,
+    )
+
+    if columns is not None:
+        expected = expected.select(columns)
+
+    # Apply the same nrows/skiprows window to the expected table
+    length = None if nrows == -1 else nrows
+    expected = expected.slice(offset=skiprows, length=length)
+
+    assert_table_and_meta_eq(expected, result, check_field_nullability=False)

From 503ce030f9523eda83677caafdd221385348a69c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 12:11:03 -1000
Subject: [PATCH 03/11] Add transpose API to pylibcudf (#16749)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16749
---
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../api_docs/pylibcudf/transpose.rst          |  6 +++
 python/cudf/cudf/_lib/transpose.pyx           | 30 ++++-----------
 python/pylibcudf/pylibcudf/CMakeLists.txt     |  1 +
 python/pylibcudf/pylibcudf/__init__.pxd       |  2 +
 python/pylibcudf/pylibcudf/__init__.py        |  2 +
 .../pylibcudf/tests/test_transpose.py         | 32 ++++++++++++++++
 python/pylibcudf/pylibcudf/transpose.pxd      |  5 +++
 python/pylibcudf/pylibcudf/transpose.pyx      | 38 +++++++++++++++++++
 9 files changed, 95 insertions(+), 22 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_transpose.py
 create mode 100644 python/pylibcudf/pylibcudf/transpose.pxd
 create mode 100644 python/pylibcudf/pylibcudf/transpose.pyx

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index d6f8cd2a1ff..edb0963ed29 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -38,6 +38,7 @@ This page provides API documentation for pylibcudf.
     table
     traits
     transform
+    transpose
     types
     unary
 
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst
new file mode 100644
index 00000000000..6241295e770
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst
@@ -0,0 +1,6 @@
+=========
+transpose
+=========
+
+.. automodule:: pylibcudf.transpose
+   :members:
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
index f78fbd4c844..995d278cb88 100644
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ b/python/cudf/cudf/_lib/transpose.pyx
@@ -1,32 +1,18 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.transpose cimport transpose as cpp_transpose
+import pylibcudf as plc
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns
 
 
 def transpose(list source_columns):
     """Transpose m n-row columns into n m-row columns
     """
-    cdef pair[unique_ptr[column], table_view] c_result
-    cdef table_view c_input = table_view_from_columns(source_columns)
-
-    with nogil:
-        c_result = move(cpp_transpose(c_input))
-
-    # Notice, the data pointer of `result_owner` has been exposed
-    # through `c_result.second` at this point.
-    result_owner = Column.from_unique_ptr(
-        move(c_result.first), data_ptr_exposed=True
-    )
-    return columns_from_table_view(
-        c_result.second,
-        owners=[result_owner] * c_result.second.num_columns()
+    input_table = plc.table.Table(
+        [col.to_pylibcudf(mode="read") for col in source_columns]
     )
+    result_table = plc.transpose.transpose(input_table)
+    return [
+        Column.from_pylibcudf(col, data_ptr_exposed=True)
+        for col in result_table.columns()
+    ]
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index f07c8897e34..fb3a6c13a70 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -44,6 +44,7 @@ set(cython_sources
     table.pyx
     traits.pyx
     transform.pyx
+    transpose.pyx
     types.pyx
     unary.pyx
     utils.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index b7cf6413c05..66d9c3d6165 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -29,6 +29,7 @@ from . cimport (
     strings,
     traits,
     transform,
+    transpose,
     types,
     unary,
 )
@@ -72,6 +73,7 @@ __all__ = [
     "sorting",
     "traits",
     "transform",
+    "transpose",
     "types",
     "unary",
 ]
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index 84b1c29f791..0a3615fa941 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -40,6 +40,7 @@
     strings,
     traits,
     transform,
+    transpose,
     types,
     unary,
 )
@@ -86,6 +87,7 @@
     "sorting",
     "traits",
     "transform",
+    "transpose",
     "types",
     "unary",
 ]
diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py
new file mode 100644
index 00000000000..ac11123f680
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from packaging.version import parse
+
+
+@pytest.mark.skipif(
+    parse(pa.__version__) < parse("16.0.0"),
+    reason="https://github.com/apache/arrow/pull/40070",
+)
+@pytest.mark.parametrize("arr", [[], [1, 2, 3], [1, 2], [1]])
+def test_transpose(arr):
+    """pylibcudf transpose must agree with pandas ``.T``."""
+    source = pa.table({"a": arr, "b": arr})
+
+    # Device round trip: arrow -> pylibcudf -> transpose -> arrow
+    transposed = plc.transpose.transpose(plc.interop.from_arrow(source))
+    result = plc.interop.to_arrow(transposed)
+
+    # Host reference: transpose with pandas, then replace every column
+    # name with an empty string
+    blank_names = [""] * len(arr)
+    reference = pa.Table.from_pandas(
+        source.to_pandas().T, preserve_index=False
+    )
+    reference = reference.rename_columns(blank_names)
+    # Rebuild the reference against the result's schema before comparing
+    expected = pa.table(reference, schema=result.schema)
+
+    assert result.equals(expected)
diff --git a/python/pylibcudf/pylibcudf/transpose.pxd b/python/pylibcudf/pylibcudf/transpose.pxd
new file mode 100644
index 00000000000..7b5a7676b49
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transpose.pxd
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from .table cimport Table
+
+
+cpdef Table transpose(Table input_table)
diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx
new file mode 100644
index 00000000000..a708f6cc37f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transpose.pyx
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from pylibcudf.libcudf cimport transpose as cpp_transpose
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table_view cimport table_view
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Table transpose(Table input_table):
+    """Transpose a Table.
+
+    For details, see :cpp:func:`transpose`.
+
+    Parameters
+    ----------
+    input_table : Table
+        Table to transpose
+
+    Returns
+    -------
+    Table
+        Transposed table.
+    """
+    cdef pair[unique_ptr[column], table_view] c_result
+    cdef Table owner_table
+
+    with nogil:
+        c_result = move(cpp_transpose.transpose(input_table.view()))
+    # Repeat the one returned column so each view column has an owner entry
+    owner_table = Table(
+        [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns()
+    )
+
+    return Table.from_table_view(c_result.second, owner_table)

From 0425963e14570fc723e3804f0bd7de7460d295f2 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 25 Sep 2024 17:43:07 -0500
Subject: [PATCH 04/11] Add experimental `filesystem="arrow"` support in
 `dask_cudf.read_parquet` (#16684)

This PR piggybacks on the existing CPU/Arrow Parquet infrastructure in dask-expr. With this PR,

```python
df = dask_cudf.read_parquet(path, filesystem="arrow")
```
will produce a `cudf`-backed collection using PyArrow for IO (i.e. disk->`pa.Table`->`cudf.DataFrame`). Before this PR, passing `filesystem="arrow"` would simply result in an error.

Although this code path is not ideal for fast/local storage, it can be **very** efficient for remote storage (e.g. S3).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Matthew Murray (https://github.com/Matt711)
  - David Wendt (https://github.com/davidwendt)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - https://github.com/brandon-b-miller
  - https://github.com/nvdbaranec

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16684
---
 docs/dask_cudf/source/best_practices.rst      |   9 ++
 docs/dask_cudf/source/index.rst               |   7 +-
 python/dask_cudf/dask_cudf/backends.py        | 142 +++++++++++++++++-
 python/dask_cudf/dask_cudf/expr/_expr.py      |  89 +++++++++++
 .../dask_cudf/dask_cudf/io/tests/test_s3.py   |  41 +++--
 5 files changed, 267 insertions(+), 21 deletions(-)

diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst
index 142124163af..83039f86fed 100644
--- a/docs/dask_cudf/source/best_practices.rst
+++ b/docs/dask_cudf/source/best_practices.rst
@@ -252,6 +252,15 @@ result in a simple 1-to-1 mapping between files and output partitions.
   correspond to a reasonable partition size, use ``blocksize=None``
   to avoid unnecessary metadata collection.
 
+.. note::
+  When reading from remote storage (e.g. S3 and GCS), performance will
+  likely improve with ``filesystem="arrow"``. When this option is set,
+  PyArrow will be used to perform IO on multiple CPU threads. Please be
+  aware that this feature is experimental, and behavior may change in
+  the future (without deprecation). Do not pass in ``blocksize`` or
+  ``aggregate_files`` when this feature is used. Instead, set the
+  ``"dataframe.parquet.minimum-partition-size"`` config to control
+  file aggregation.
 
 Use :func:`from_map`
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 23ca7e49753..6eb755d7854 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -40,9 +40,10 @@ Using Dask cuDF
 The Dask DataFrame API (Recommended)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Simply use the `Dask configuration <dask:configuration>`__ system to
-set the ``"dataframe.backend"`` option to ``"cudf"``. From Python,
-this can be achieved like so::
+Simply use the `Dask configuration
+<https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html>`__
+system to set the ``"dataframe.backend"`` option to ``"cudf"``.
+From Python, this can be achieved like so::
 
   import dask
 
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 9347ebba5de..bead964a0ef 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+from packaging.version import Version
 from pandas.api.types import is_scalar
 
 import dask.dataframe as dd
@@ -52,6 +53,10 @@
 get_parallel_type.register(cudf.BaseIndex, lambda _: Index)
 
 
+# Required for Arrow filesystem support in read_parquet
+PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0")
+
+
 @meta_nonempty.register(cudf.BaseIndex)
 @_dask_cudf_performance_tracking
 def _nonempty_index(idx):
@@ -695,15 +700,170 @@ def from_dict(
         )
 
     @staticmethod
-    def read_parquet(*args, engine=None, **kwargs):
+    def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs):
+        """Read Parquet data into a cudf-backed Dask collection.
+
+        Parameters
+        ----------
+        path
+            Dataset location; stringified on the Arrow code path.
+        filesystem
+            ``"fsspec"`` (default) or an ``fsspec.AbstractFileSystem``
+            selects the ``CudfEngine`` code path. ``"arrow"``, ``"pyarrow"``
+            or a ``pyarrow.fs.FileSystem`` selects the experimental
+            PyArrow code path, which requires pyarrow>=15.
+        engine
+            Ignored; the fsspec code path always passes ``CudfEngine``.
+        **kwargs
+            Remaining ``read_parquet`` options. On the Arrow code path,
+            ``blocksize`` and ``aggregate_files`` only trigger a warning.
+
+        Raises
+        ------
+        ValueError
+            On the Arrow code path, for extra positional arguments, an
+            unrecognized ``filesystem``, or a non-None ``arrow_to_pandas``
+            or ``open_file_options``.
+        NotImplementedError
+            On the Arrow code path, for pyarrow<15 or a non-default
+            ``metadata_task_size``, ``split_row_groups`` or
+            ``parquet_file_extension``.
+        TypeError
+            On the Arrow code path, if an ``"in"`` filter value is not a
+            list, set or tuple.
+        """
         import dask_expr as dx
+        import fsspec
 
-        from dask_cudf.io.parquet import CudfEngine
+        if (
+            isinstance(filesystem, fsspec.AbstractFileSystem)
+            or isinstance(filesystem, str)
+            and filesystem.lower() == "fsspec"
+        ):
+            # Default "fsspec" filesystem
+            from dask_cudf.io.parquet import CudfEngine
 
-        _raise_unsupported_parquet_kwargs(**kwargs)
-        return _default_backend(
-            dx.read_parquet, *args, engine=CudfEngine, **kwargs
-        )
+            _raise_unsupported_parquet_kwargs(**kwargs)
+            return _default_backend(
+                dx.read_parquet,
+                path,
+                *args,
+                filesystem=filesystem,
+                engine=CudfEngine,
+                **kwargs,
+            )
+
+        else:
+            # EXPERIMENTAL filesystem="arrow" support.
+            # This code path uses PyArrow for IO, which is only
+            # beneficial for remote storage (e.g. S3)
+
+            from fsspec.utils import stringify_path
+            from pyarrow import fs as pa_fs
+
+            # CudfReadParquetPyarrowFS requires import of distributed beforehand
+            # (See: https://github.com/dask/dask/issues/11352)
+            import distributed  # noqa: F401
+            from dask.core import flatten
+            from dask.dataframe.utils import pyarrow_strings_enabled
+
+            from dask_cudf.expr._expr import CudfReadParquetPyarrowFS
+
+            if args:
+                raise ValueError(f"Unexpected positional arguments: {args}")
+
+            if not (
+                isinstance(filesystem, pa_fs.FileSystem)
+                or isinstance(filesystem, str)
+                and filesystem.lower() in ("arrow", "pyarrow")
+            ):
+                raise ValueError(f"Unexpected filesystem value: {filesystem}.")
+
+            if not PYARROW_GE_15:
+                raise NotImplementedError(
+                    "Experimental Arrow filesystem support requires pyarrow>=15"
+                )
+
+            if not isinstance(path, str):
+                path = stringify_path(path)
+
+            # Extract kwargs
+            columns = kwargs.pop("columns", None)
+            filters = kwargs.pop("filters", None)
+            categories = kwargs.pop("categories", None)
+            index = kwargs.pop("index", None)
+            storage_options = kwargs.pop("storage_options", None)
+            dtype_backend = kwargs.pop("dtype_backend", None)
+            calculate_divisions = kwargs.pop("calculate_divisions", False)
+            ignore_metadata_file = kwargs.pop("ignore_metadata_file", False)
+            metadata_task_size = kwargs.pop("metadata_task_size", None)
+            split_row_groups = kwargs.pop("split_row_groups", "infer")
+            blocksize = kwargs.pop("blocksize", "default")
+            aggregate_files = kwargs.pop("aggregate_files", None)
+            parquet_file_extension = kwargs.pop(
+                "parquet_file_extension", (".parq", ".parquet", ".pq")
+            )
+            arrow_to_pandas = kwargs.pop("arrow_to_pandas", None)
+            open_file_options = kwargs.pop("open_file_options", None)
+
+            # Validate and normalize kwargs
+            kwargs["dtype_backend"] = dtype_backend
+            if arrow_to_pandas is not None:
+                raise ValueError(
+                    "arrow_to_pandas not supported for the 'cudf' backend."
+                )
+            if open_file_options is not None:
+                raise ValueError(
+                    "The open_file_options argument is no longer supported "
+                    "by the 'cudf' backend."
+                )
+            if filters is not None:
+                for _, op, val in flatten(filters, container=list):
+                    if op == "in" and not isinstance(val, (set, list, tuple)):
+                        raise TypeError(
+                            "Value of 'in' filter must be a list, set or tuple."
+                        )
+            if metadata_task_size is not None:
+                raise NotImplementedError(
+                    "metadata_task_size is not supported when using the pyarrow filesystem."
+                )
+            if split_row_groups != "infer":
+                raise NotImplementedError(
+                    "split_row_groups is not supported when using the pyarrow filesystem."
+                )
+            if parquet_file_extension != (".parq", ".parquet", ".pq"):
+                raise NotImplementedError(
+                    "parquet_file_extension is not supported when using the pyarrow filesystem."
+                )
+            if blocksize is not None and blocksize != "default":
+                # Adjacent literals are concatenated verbatim, so each
+                # sentence must carry its own trailing space.
+                warnings.warn(
+                    "blocksize is not supported when using the pyarrow filesystem. "
+                    "blocksize argument will be ignored."
+                )
+            if aggregate_files is not None:
+                warnings.warn(
+                    "aggregate_files is not supported when using the pyarrow filesystem. "
+                    "Please use the 'dataframe.parquet.minimum-partition-size' config. "
+                    "aggregate_files argument will be ignored."
+                )
+
+            return dx.new_collection(
+                CudfReadParquetPyarrowFS(
+                    path,
+                    columns=dx._util._convert_to_list(columns),
+                    filters=filters,
+                    categories=categories,
+                    index=index,
+                    calculate_divisions=calculate_divisions,
+                    storage_options=storage_options,
+                    filesystem=filesystem,
+                    ignore_metadata_file=ignore_metadata_file,
+                    arrow_to_pandas=arrow_to_pandas,
+                    pyarrow_strings_enabled=pyarrow_strings_enabled(),
+                    kwargs=kwargs,
+                    _series=isinstance(columns, str),
+                )
+            )
 
     @staticmethod
     def read_csv(
diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py
index b284ab3774d..af83a01da98 100644
--- a/python/dask_cudf/dask_cudf/expr/_expr.py
+++ b/python/dask_cudf/dask_cudf/expr/_expr.py
@@ -2,10 +2,13 @@
 import functools
 
 import dask_expr._shuffle as _shuffle_module
+import pandas as pd
 from dask_expr import new_collection
 from dask_expr._cumulative import CumulativeBlockwise
 from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns
 from dask_expr._reductions import Reduction, Var
+from dask_expr.io.io import FusedParquetIO
+from dask_expr.io.parquet import ReadParquetPyarrowFS
 
 from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty
 from dask.dataframe.dispatch import is_categorical_dtype
@@ -18,6 +21,121 @@
 ##
 
 
+class CudfFusedParquetIO(FusedParquetIO):
+    """``FusedParquetIO`` variant backed by ``CudfReadParquetPyarrowFS``."""
+
+    @staticmethod
+    def _load_multiple_files(
+        frag_filters,
+        columns,
+        schema,
+        *to_pandas_args,
+    ):
+        """Read several fragments and return them as one cudf DataFrame.
+
+        One ``_fragment_to_table`` task is created per ``(fragment, filter)``
+        pair in ``frag_filters``; the tables are concatenated with
+        ``pa.concat_tables`` (permissive promotion), the graph is run with
+        ``dask.threaded.get``, and the result goes through
+        ``CudfReadParquetPyarrowFS._table_to_pandas``.
+        """
+        import pyarrow as pa
+
+        from dask.base import apply, tokenize
+        from dask.threaded import get
+
+        token = tokenize(frag_filters, columns, schema)
+        name = f"pq-file-{token}"
+        dsk = {
+            (name, i): (
+                CudfReadParquetPyarrowFS._fragment_to_table,
+                frag,
+                filter,
+                columns,
+                schema,
+            )
+            for i, (frag, filter) in enumerate(frag_filters)
+        }
+        dsk[name] = (
+            apply,
+            pa.concat_tables,
+            [list(dsk.keys())],
+            {"promote_options": "permissive"},
+        )
+        return CudfReadParquetPyarrowFS._table_to_pandas(
+            get(dsk, name),
+            *to_pandas_args,
+        )
+
+
+class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS):
+    """``ReadParquetPyarrowFS`` expression that produces cudf objects."""
+
+    @functools.cached_property
+    def _dataset_info(self):
+        """Return the parent's dataset info with a cudf ``base_meta``.
+
+        If ``base_meta`` is not already a ``cudf.DataFrame``, timezone
+        information is dropped from tz-aware datetime columns, the metadata
+        is converted with ``cudf.from_pandas``, and the updated mapping is
+        stored in the ``_dataset_info_cache`` operand.
+        """
+        from dask_cudf.io.parquet import set_object_dtypes_from_pa_schema
+
+        dataset_info = super()._dataset_info
+        meta_pd = dataset_info["base_meta"]
+        if isinstance(meta_pd, cudf.DataFrame):
+            return dataset_info
+
+        # Convert to cudf
+        # (drop unsupported timezone information)
+        for k, v in meta_pd.dtypes.items():
+            if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None:
+                meta_pd[k] = meta_pd[k].dt.tz_localize(None)
+        meta_cudf = cudf.from_pandas(meta_pd)
+
+        # Re-set "object" dtypes to align with pa schema
+        kwargs = dataset_info.get("kwargs", {})
+        set_object_dtypes_from_pa_schema(
+            meta_cudf,
+            kwargs.get("schema", None),
+        )
+
+        dataset_info["base_meta"] = meta_cudf
+        self.operands[type(self)._parameters.index("_dataset_info_cache")] = (
+            dataset_info
+        )
+        return dataset_info
+
+    @staticmethod
+    def _table_to_pandas(
+        table,
+        index_name,
+        *args,
+    ):
+        """Convert an Arrow table to a ``cudf.DataFrame``.
+
+        ``index_name``, when not None, is set as the index. Remaining
+        positional arguments are accepted but unused.
+        """
+        df = cudf.DataFrame.from_arrow(table)
+        if index_name is not None:
+            df = df.set_index(index_name)
+        return df
+
+    def _tune_up(self, parent):
+        """Wrap this expression in ``CudfFusedParquetIO`` within ``parent``.
+
+        Returns None when ``_fusion_compression_factor`` is at least 1 or
+        when ``parent`` is already a ``CudfFusedParquetIO``.
+        """
+        if self._fusion_compression_factor >= 1:
+            return
+        if isinstance(parent, CudfFusedParquetIO):
+            return
+        return parent.substitute(self, CudfFusedParquetIO(self))
+
+
 class RenameAxisCudf(RenameAxis):
     # TODO: Remove this after rename_axis is supported in cudf
     # (See: https://github.com/rapidsai/cudf/issues/16895)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a14ffbc37dc..cf8af82e112 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -12,6 +12,7 @@
 from dask.dataframe import assert_eq
 
 import dask_cudf
+from dask_cudf.tests.utils import QUERY_PLANNING_ON
 
 moto = pytest.importorskip("moto", minversion="3.1.6")
 boto3 = pytest.importorskip("boto3")
@@ -127,7 +128,20 @@ def test_read_parquet_open_file_options_raises():
         )
 
 
-def test_read_parquet_filesystem(s3_base, s3so, pdf):
+@pytest.mark.parametrize(
+    "filesystem",
+    [
+        pytest.param(
+            "arrow",
+            marks=pytest.mark.skipif(
+                not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15,
+                reason="Not supported",
+            ),
+        ),
+        "fsspec",
+    ],
+)
+def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem):
     fname = "test_parquet_filesystem.parquet"
     bucket = "parquet"
     buffer = BytesIO()
@@ -135,21 +149,24 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf):
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
         path = f"s3://{bucket}/{fname}"
+        if filesystem == "arrow":
+            # This feature requires arrow >= 15
+            pytest.importorskip("pyarrow", minversion="15.0.0")
 
-        # Cannot pass filesystem="arrow"
-        with pytest.raises(ValueError):
-            dask_cudf.read_parquet(
+            import pyarrow.fs as pa_fs
+
+            df = dask_cudf.read_parquet(
+                path,
+                filesystem=pa_fs.S3FileSystem(
+                    endpoint_override=s3so["client_kwargs"]["endpoint_url"],
+                ),
+            )
+        else:
+            df = dask_cudf.read_parquet(
                 path,
                 storage_options=s3so,
-                filesystem="arrow",
+                filesystem=filesystem,
             )
-
-        # Can pass filesystem="fsspec"
-        df = dask_cudf.read_parquet(
-            path,
-            storage_options=s3so,
-            filesystem="fsspec",
-        )
         assert df.b.sum().compute() == 9
 
 

From c7f6a22bb3edd3cea377d5405ca48a9eee353bc4 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 12:59:58 -1000
Subject: [PATCH 05/11] Add string.attributes APIs to pylibcudf (#16785)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16785
---
 python/cudf/cudf/_lib/strings/attributes.pyx  | 46 ++++-------
 .../pylibcudf/strings/CMakeLists.txt          | 17 ++++-
 .../pylibcudf/pylibcudf/strings/__init__.pxd  | 19 +++++
 .../pylibcudf/pylibcudf/strings/__init__.py   | 19 +++++
 .../pylibcudf/strings/attributes.pxd          | 10 +++
 .../pylibcudf/strings/attributes.pyx          | 76 +++++++++++++++++++
 .../pylibcudf/tests/test_string_attributes.py | 32 ++++++++
 7 files changed, 185 insertions(+), 34 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/strings/attributes.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/attributes.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_attributes.py

diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx
index fe8c17c9e31..df81b3942b4 100644
--- a/python/cudf/cudf/_lib/strings/attributes.pyx
+++ b/python/cudf/cudf/_lib/strings/attributes.pyx
@@ -2,19 +2,10 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.attributes cimport (
-    code_points as cpp_code_points,
-    count_bytes as cpp_count_bytes,
-    count_characters as cpp_count_characters,
-)
-
 from cudf._lib.column cimport Column
 
+import pylibcudf as plc
+
 
 @acquire_spill_lock()
 def count_characters(Column source_strings):
@@ -22,13 +13,10 @@ def count_characters(Column source_strings):
     Returns an integer numeric column containing the
     length of each string in characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_count_characters(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.attributes.count_characters(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -37,13 +25,10 @@ def count_bytes(Column source_strings):
     Returns an integer numeric column containing the
     number of bytes of each string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_count_bytes(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.attributes.count_bytes(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -52,10 +37,7 @@ def code_points(Column source_strings):
     Creates a numeric column with code point values (integers)
     for each character of each string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_code_points(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.attributes.code_points(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index 77f20b0b917..142bc124ca2 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -13,8 +13,21 @@
 # =============================================================================
 
 set(cython_sources
-    capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx
-    regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx
+    attributes.pyx
+    capitalize.pyx
+    case.pyx
+    char_types.pyx
+    contains.pyx
+    extract.pyx
+    find.pyx
+    findall.pyx
+    regex_flags.pyx
+    regex_program.pyx
+    repeat.pyx
+    replace.pyx
+    side_type.pyx
+    slice.pyx
+    strip.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index 91d884b294b..d8afccc7336 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from . cimport (
+    attributes,
     capitalize,
     case,
     char_types,
@@ -16,3 +17,21 @@ from . cimport (
     strip,
 )
 from .side_type cimport side_type
+
+__all__ = [
+    "attributes",
+    "capitalize",
+    "case",
+    "char_types",
+    "contains",
+    "convert",
+    "extract",
+    "find",
+    "findall",
+    "regex_flags",
+    "regex_program",
+    "replace",
+    "slice",
+    "strip",
+    "side_type",
+]
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index b4856784390..22452812e42 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from . import (
+    attributes,
     capitalize,
     case,
     char_types,
@@ -17,3 +18,21 @@
     strip,
 )
 from .side_type import SideType
+
+__all__ = [
+    "attributes",
+    "capitalize",
+    "case",
+    "char_types",
+    "contains",
+    "convert",
+    "extract",
+    "find",
+    "findall",
+    "regex_flags",
+    "regex_program",
+    "replace",
+    "slice",
+    "strip",
+    "SideType",
+]
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/strings/attributes.pxd
new file mode 100644
index 00000000000..27398766924
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/attributes.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+
+
+cpdef Column count_characters(Column source_strings)
+
+cpdef Column count_bytes(Column source_strings)
+
+cpdef Column code_points(Column source_strings)
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx
new file mode 100644
index 00000000000..36bee7bd1d9
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport attributes as cpp_attributes
+
+
+cpdef Column count_characters(Column source_strings):
+    """
+    Returns a column containing character lengths of each string
+    in the given column.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with lengths for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.count_characters(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column count_bytes(Column source_strings):
+    """
+    Returns a column containing byte lengths of each string
+    in the given column.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with the number of bytes for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.count_bytes(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column code_points(Column source_strings):
+    """
+    Creates a numeric column with code point values (integers)
+    for each character of each string.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with code point integer values for each character
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.code_points(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
new file mode 100644
index 00000000000..a1820def0b1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture()
+def str_data():
+    """Return a pyarrow string array and its pylibcudf column."""
+    pa_data = pa.array(["A", None])
+    return pa_data, plc.interop.from_arrow(pa_data)
+
+
+def test_count_characters(str_data):
+    """count_characters matches pyarrow's utf8_length."""
+    result = plc.strings.attributes.count_characters(str_data[1])
+    expected = pc.utf8_length(str_data[0])
+    assert_column_eq(expected, result)
+
+
+def test_count_bytes(str_data):
+    """count_bytes matches pyarrow's binary_length."""
+    # Exercise count_bytes itself: the fixture holds only "A" and null,
+    # so a count_characters call would produce the same lengths.
+    result = plc.strings.attributes.count_bytes(str_data[1])
+    expected = pc.binary_length(str_data[0])
+    assert_column_eq(expected, result)
+
+
+def test_code_points(str_data):
+    """code_points returns the code point of the fixture's first string."""
+    result = plc.strings.attributes.code_points(str_data[1])
+    exp_data = [ord(str_data[0].to_pylist()[0])]
+    expected = pa.chunked_array([exp_data], type=pa.int32())
+    assert_column_eq(expected, result)

From 12ee360048473ddd06019090c7d19c67d6959f7a Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Wed, 25 Sep 2024 20:13:45 -0400
Subject: [PATCH 06/11] [REVIEW] JSON host tree algorithms (#16545)

Depends on #16836
This change adds new host tree building algorithms for the JSON reader and utf8 field name support.

This constructs the device_column_tree using an adjacency list created from parent information.
This adjacency list is pruned based on input schema, and also types are enforced as per schema. `mark_is_pruned`
Tree is constructed from pruned adjacency list, (with mixed types handling). `construct_tree`

utf8 field name support added: (spark requested)
utf8 decoding of field names during hashing of field nodes so that utf8 encoded field names also match to same column.

All unit tests pass; 1 unit test was added for a case where the old algorithm fails.
This code is kept under experimental flag.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16545
---
 cpp/include/cudf/io/json.hpp                  |  36 +
 cpp/src/io/json/host_tree_algorithms.cu       | 776 ++++++++++++++++--
 cpp/src/io/json/json_column.cu                |  46 +-
 cpp/src/io/json/json_tree.cu                  | 153 +++-
 cpp/src/io/json/nested_json.hpp               |  29 +-
 cpp/tests/io/json/json_test.cpp               |  53 ++
 cpp/tests/io/json/json_tree.cpp               |   1 +
 cpp/tests/io/json/json_tree_csr.cu            |   1 +
 .../main/java/ai/rapids/cudf/JSONOptions.java |  15 +
 java/src/main/java/ai/rapids/cudf/Table.java  |   9 +
 java/src/main/native/src/TableJni.cpp         |  12 +-
 11 files changed, 1011 insertions(+), 120 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index ff25a5bacae..6798557e14e 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -105,6 +105,8 @@ class json_reader_options {
   char _delimiter = '\n';
   // Prune columns on read, selected based on the _dtypes option
   bool _prune_columns = false;
+  // Experimental features: new column tree construction
+  bool _experimental = false;
 
   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
@@ -277,6 +279,15 @@ class json_reader_options {
    */
   [[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }
 
+  /**
+   * @brief Whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction and
+   * utf-8 matching of field names, will be enabled.
+   * @return true if experimental features are enabled
+   */
+  [[nodiscard]] bool is_enabled_experimental() const { return _experimental; }
+
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
@@ -453,6 +464,16 @@ class json_reader_options {
    */
   void enable_prune_columns(bool val) { _prune_columns = val; }
 
+  /**
+   * @brief Set whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction and
+   * utf-8 matching of field names, will be enabled.
+   *
+   * @param val Boolean value to enable/disable experimental features
+   */
+  void enable_experimental(bool val) { _experimental = val; }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
@@ -695,6 +716,21 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction and
+   * utf-8 matching of field names, will be enabled.
+   *
+   * @param val Boolean value to enable/disable experimental features
+   * @return this for chaining
+   */
+  json_reader_options_builder& experimental(bool val)
+  {
+    options._experimental = val;
+    return *this;
+  }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index 70d61132b42..5855f1b5a5f 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -43,6 +44,7 @@
 #include <thrust/uninitialized_fill.h>
 
 #include <algorithm>
+#include <deque>
 
 namespace cudf::io::json::detail {
 
@@ -58,16 +60,15 @@ namespace cudf::io::json::detail {
  */
 rmm::device_uvector<NodeIndexT> get_values_column_indices(TreeDepthT const row_array_children_level,
                                                           tree_meta_t const& d_tree,
-                                                          device_span<NodeIndexT> col_ids,
+                                                          device_span<NodeIndexT const> col_ids,
                                                           size_type const num_columns,
                                                           rmm::cuda_stream_view stream)
 {
-  CUDF_FUNC_RANGE();
   auto [level2_nodes, level2_indices] = get_array_children_indices(
     row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream);
   auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin());
   rmm::device_uvector<NodeIndexT> values_column_indices(num_columns, stream);
-  thrust::scatter(rmm::exec_policy(stream),
+  thrust::scatter(rmm::exec_policy_nosync(stream),
                   level2_indices.begin(),
                   level2_indices.end(),
                   col_id_location,
@@ -90,12 +91,11 @@ std::vector<std::string> copy_strings_to_host_sync(
   device_span<SymbolOffsetT const> node_range_end,
   rmm::cuda_stream_view stream)
 {
-  CUDF_FUNC_RANGE();
   auto const num_strings = node_range_begin.size();
   rmm::device_uvector<size_type> string_offsets(num_strings, stream);
   rmm::device_uvector<size_type> string_lengths(num_strings, stream);
   auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
-  thrust::transform(rmm::exec_policy(stream),
+  thrust::transform(rmm::exec_policy_nosync(stream),
                     d_offset_pairs,
                     d_offset_pairs + num_strings,
                     thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()),
@@ -161,18 +161,18 @@ std::vector<std::string> copy_strings_to_host_sync(
 rmm::device_uvector<uint8_t> is_all_nulls_each_column(device_span<SymbolT const> input,
                                                       tree_meta_t const& d_column_tree,
                                                       tree_meta_t const& tree,
-                                                      device_span<NodeIndexT> col_ids,
+                                                      device_span<NodeIndexT const> col_ids,
                                                       cudf::io::json_reader_options const& options,
                                                       rmm::cuda_stream_view stream)
 {
   auto const num_nodes = col_ids.size();
   auto const num_cols  = d_column_tree.node_categories.size();
   rmm::device_uvector<uint8_t> is_all_nulls(num_cols, stream);
-  thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true);
+  thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true);
 
   auto parse_opt = parsing_options(options, stream);
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::counting_iterator<size_type>(0),
     num_nodes,
     [options           = parse_opt.view(),
@@ -193,7 +193,7 @@ rmm::device_uvector<uint8_t> is_all_nulls_each_column(device_span<SymbolT const>
   return is_all_nulls;
 }
 
-NodeIndexT get_row_array_parent_col_id(device_span<NodeIndexT> col_ids,
+NodeIndexT get_row_array_parent_col_id(device_span<NodeIndexT const> col_ids,
                                        bool is_enabled_lines,
                                        rmm::cuda_stream_view stream)
 {
@@ -221,33 +221,34 @@ struct json_column_data {
   bitmask_type* validity;
 };
 
-std::pair<cudf::detail::host_vector<uint8_t>,
-          std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>>
-build_tree(device_json_column& root,
-           std::vector<uint8_t> const& is_str_column_all_nulls,
-           tree_meta_t& d_column_tree,
-           device_span<NodeIndexT const> d_unique_col_ids,
-           device_span<size_type const> d_max_row_offsets,
-           std::vector<std::string> const& column_names,
-           NodeIndexT row_array_parent_col_id,
-           bool is_array_of_arrays,
-           cudf::io::json_reader_options const& options,
-           rmm::cuda_stream_view stream,
-           rmm::device_async_resource_ref mr);
-void scatter_offsets(
-  tree_meta_t& tree,
-  device_span<NodeIndexT> col_ids,
-  device_span<size_type> row_offsets,
-  device_span<size_type> node_ids,
-  device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+using hashmap_of_device_columns =
+  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>;
+
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
   tree_meta_t& d_column_tree,
-  host_span<const uint8_t> ignore_vals,
-  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>& columns,
-  rmm::cuda_stream_view stream);
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+void scatter_offsets(tree_meta_t const& tree,
+                     device_span<NodeIndexT const> col_ids,
+                     device_span<size_type const> row_offsets,
+                     device_span<size_type> node_ids,
+                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+                     tree_meta_t const& d_column_tree,
+                     host_span<const bool> ignore_vals,
+                     hashmap_of_device_columns const& columns,
+                     rmm::cuda_stream_view stream);
 
 /**
  * @brief Constructs `d_json_column` from node tree representation
- * Newly constructed columns are insert into `root`'s children.
+ * Newly constructed columns are inserted into `root`'s children.
  * `root` must be a list type.
  *
  * @param input Input JSON string device data
@@ -265,28 +266,28 @@ void scatter_offsets(
  * of child_offets and validity members of `d_json_column`
  */
 void make_device_json_column(device_span<SymbolT const> input,
-                             tree_meta_t& tree,
-                             device_span<NodeIndexT> col_ids,
-                             device_span<size_type> row_offsets,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
                              device_json_column& root,
                              bool is_array_of_arrays,
                              cudf::io::json_reader_options const& options,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
-  CUDF_FUNC_RANGE();
-
   bool const is_enabled_lines                 = options.is_enabled_lines();
   bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
-  auto const num_nodes                        = col_ids.size();
-  rmm::device_uvector<NodeIndexT> sorted_col_ids(col_ids.size(), stream);  // make a copy
-  thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin());
+  // make a copy
+  auto sorted_col_ids = cudf::detail::make_device_uvector_async(
+    col_ids, stream, cudf::get_current_device_resource_ref());
 
   // sort by {col_id} on {node_ids} stable
   rmm::device_uvector<NodeIndexT> node_ids(col_ids.size(), stream);
-  thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end());
-  thrust::stable_sort_by_key(
-    rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin());
+  thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end());
+  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
+                             sorted_col_ids.begin(),
+                             sorted_col_ids.end(),
+                             node_ids.begin());
 
   NodeIndexT const row_array_parent_col_id =
     get_row_array_parent_col_id(col_ids, is_enabled_lines, stream);
@@ -316,7 +317,7 @@ void make_device_json_column(device_span<SymbolT const> input,
       cudf::detail::make_host_vector_sync(values_column_indices, stream);
     std::transform(unique_col_ids.begin(),
                    unique_col_ids.end(),
-                   column_names.begin(),
+                   column_names.cbegin(),
                    column_names.begin(),
                    [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id](
                      auto col_id, auto name) mutable {
@@ -333,17 +334,17 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
     return std::vector<uint8_t>();
   }();
-  auto [ignore_vals, columns] = build_tree(root,
-                                           is_str_column_all_nulls,
-                                           d_column_tree,
-                                           d_unique_col_ids,
-                                           d_max_row_offsets,
-                                           column_names,
-                                           row_array_parent_col_id,
-                                           is_array_of_arrays,
-                                           options,
-                                           stream,
-                                           mr);
+  auto const [ignore_vals, columns] = build_tree(root,
+                                                 is_str_column_all_nulls,
+                                                 d_column_tree,
+                                                 d_unique_col_ids,
+                                                 d_max_row_offsets,
+                                                 column_names,
+                                                 row_array_parent_col_id,
+                                                 is_array_of_arrays,
+                                                 options,
+                                                 stream,
+                                                 mr);
 
   scatter_offsets(tree,
                   col_ids,
@@ -356,19 +357,18 @@ void make_device_json_column(device_span<SymbolT const> input,
                   stream);
 }
 
-std::pair<cudf::detail::host_vector<uint8_t>,
-          std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>>
-build_tree(device_json_column& root,
-           std::vector<uint8_t> const& is_str_column_all_nulls,
-           tree_meta_t& d_column_tree,
-           device_span<NodeIndexT const> d_unique_col_ids,
-           device_span<size_type const> d_max_row_offsets,
-           std::vector<std::string> const& column_names,
-           NodeIndexT row_array_parent_col_id,
-           bool is_array_of_arrays,
-           cudf::io::json_reader_options const& options,
-           rmm::cuda_stream_view stream,
-           rmm::device_async_resource_ref mr)
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
+  tree_meta_t& d_column_tree,
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
 {
   bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
   auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
@@ -380,6 +380,7 @@ build_tree(device_json_column& root,
     cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
   auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
   auto num_columns           = d_unique_col_ids.size();
+  stream.synchronize();
 
   auto to_json_col_type = [](auto category) {
     switch (category) {
@@ -439,11 +440,12 @@ build_tree(device_json_column& root,
   });
 
   // use hash map because we may skip field name's col_ids
-  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>> columns;
+  hashmap_of_device_columns columns;
   // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking
   std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
   // find column_ids which are values, but should be ignored in validity
-  auto ignore_vals = cudf::detail::make_host_vector<uint8_t>(num_columns, stream);
+  auto ignore_vals = cudf::detail::make_host_vector<bool>(num_columns, stream);
+  std::fill(ignore_vals.begin(), ignore_vals.end(), false);
   std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
   std::vector<uint8_t> is_pruned(num_columns, 0);
   // for columns that are not mixed type but have been forced as string
@@ -452,7 +454,7 @@ build_tree(device_json_column& root,
 
   std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
     [&](NodeIndexT this_col_id, device_json_column& col) {
-      for (auto col_name : col.column_order) {
+      for (auto const& col_name : col.column_order) {
         auto child_id                  = mapped_columns[{this_col_id, col_name}];
         is_mixed_type_column[child_id] = 1;
         remove_child_columns(child_id, col.child_columns.at(col_name));
@@ -523,7 +525,7 @@ build_tree(device_json_column& root,
     if (parent_col_id != parent_node_sentinel &&
           (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) ||
         forced_as_string_column[parent_col_id]) {
-      ignore_vals[this_col_id] = 1;
+      ignore_vals[this_col_id] = true;
       if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
       if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; }
       continue;
@@ -569,12 +571,12 @@ build_tree(device_json_column& root,
       }
 
       if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
-        ignore_vals[this_col_id] = 1;
+        ignore_vals[this_col_id] = true;
         continue;
       }
       if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) {
         // remap
-        ignore_vals[old_col_id] = 1;
+        ignore_vals[old_col_id] = true;
         mapped_columns.erase({parent_col_id, name});
         columns.erase(old_col_id);
         parent_col.child_columns.erase(name);
@@ -624,7 +626,7 @@ build_tree(device_json_column& root,
       auto parent_col_id = column_parent_ids[this_col_id];
       if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) {
         is_mixed_type_column[this_col_id] = 1;
-        ignore_vals[this_col_id]          = 1;
+        ignore_vals[this_col_id]          = true;
         columns.erase(this_col_id);
       }
       // Convert only mixed type columns as string (so to copy), but not its children
@@ -644,7 +646,7 @@ build_tree(device_json_column& root,
     auto parent_col_id = column_parent_ids[this_col_id];
     if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) {
       forced_as_string_column[this_col_id] = true;
-      ignore_vals[this_col_id]             = 1;
+      ignore_vals[this_col_id]             = true;
     }
     // Convert only mixed type columns as string (so to copy), but not its children
     if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and
@@ -664,16 +666,15 @@ build_tree(device_json_column& root,
   return {ignore_vals, columns};
 }
 
-void scatter_offsets(
-  tree_meta_t& tree,
-  device_span<NodeIndexT> col_ids,
-  device_span<size_type> row_offsets,
-  device_span<size_type> node_ids,
-  device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
-  tree_meta_t& d_column_tree,
-  host_span<const uint8_t> ignore_vals,
-  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>& columns,
-  rmm::cuda_stream_view stream)
+void scatter_offsets(tree_meta_t const& tree,
+                     device_span<NodeIndexT const> col_ids,
+                     device_span<size_type const> row_offsets,
+                     device_span<size_type> node_ids,
+                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+                     tree_meta_t const& d_column_tree,
+                     host_span<const bool> ignore_vals,
+                     hashmap_of_device_columns const& columns,
+                     rmm::cuda_stream_view stream)
 {
   auto const num_nodes   = col_ids.size();
   auto const num_columns = d_column_tree.node_categories.size();
@@ -695,7 +696,7 @@ void scatter_offsets(
 
   // 3. scatter string offsets to respective columns, set validity bits
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::counting_iterator<size_type>(0),
     num_nodes,
     [column_categories = d_column_tree.node_categories.begin(),
@@ -739,7 +740,7 @@ void scatter_offsets(
                                                                   : col_ids[parent_node_ids[node_id]];
       }));
   auto const list_children_end = thrust::copy_if(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id),
     thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id) +
       num_nodes,
@@ -757,12 +758,12 @@ void scatter_offsets(
 
   auto const num_list_children =
     list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
-  thrust::stable_sort_by_key(rmm::exec_policy(stream),
+  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
                              parent_col_ids.begin(),
                              parent_col_ids.begin() + num_list_children,
                              node_ids.begin());
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_counting_iterator<size_type>(0),
     num_list_children,
     [node_ids        = node_ids.begin(),
@@ -805,4 +806,599 @@ void scatter_offsets(
   stream.synchronize();
 }
 
+namespace experimental {
+
+std::map<std::string, schema_element> unified_schema(cudf::io::json_reader_options const& options)
+{
+  return std::visit(
+    cudf::detail::visitor_overload{
+      [](std::vector<data_type> const& user_dtypes) {
+        std::map<std::string, schema_element> dnew;
+        std::transform(thrust::counting_iterator<size_t>(0),
+                       thrust::counting_iterator<size_t>(user_dtypes.size()),
+                       std::inserter(dnew, dnew.end()),
+                       [&user_dtypes](auto i) {
+                         return std::pair(std::to_string(i), schema_element{user_dtypes[i]});
+                       });
+        return dnew;
+      },
+      [](std::map<std::string, data_type> const& user_dtypes) {
+        std::map<std::string, schema_element> dnew;
+        std::transform(user_dtypes.begin(),
+                       user_dtypes.end(),
+                       std::inserter(dnew, dnew.end()),
+                       [](auto key_dtype) {
+                         return std::pair(key_dtype.first, schema_element{key_dtype.second});
+                       });
+        return dnew;
+      },
+      [](std::map<std::string, schema_element> const& user_dtypes) { return user_dtypes; }},
+    options.get_dtypes());
+}
+
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
+  tree_meta_t& d_column_tree,
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+/**
+ * @brief Constructs `d_json_column` from node tree representation
+ * Newly constructed columns are inserted into `root`'s children.
+ * `root` must be a list type.
+ *
+ * @param input Input JSON string device data
+ * @param tree Node tree representation of the JSON string
+ * @param col_ids Column ids of the nodes in the tree
+ * @param row_offsets Row offsets of the nodes in the tree
+ * @param root Root node of the `d_json_column` tree
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param options Parsing options specifying the parsing behaviour
+ * options affecting behaviour are
+ *   is_enabled_lines: Whether the input is a line-delimited JSON
+ *   is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the device memory
+ * of child_offsets and validity members of `d_json_column`
+ */
+void make_device_json_column(device_span<SymbolT const> input,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
+                             device_json_column& root,
+                             bool is_array_of_arrays,
+                             cudf::io::json_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr)
+{
+  bool const is_enabled_lines                 = options.is_enabled_lines();
+  bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
+  // make a copy
+  auto sorted_col_ids = cudf::detail::make_device_uvector_async(
+    col_ids, stream, cudf::get_current_device_resource_ref());
+
+  // sort by {col_id} on {node_ids} stable
+  rmm::device_uvector<NodeIndexT> node_ids(col_ids.size(), stream);
+  thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end());
+  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
+                             sorted_col_ids.begin(),
+                             sorted_col_ids.end(),
+                             node_ids.begin());
+
+  NodeIndexT const row_array_parent_col_id =
+    get_row_array_parent_col_id(col_ids, is_enabled_lines, stream);
+
+  // 1. gather column information.
+  auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] =
+    reduce_to_column_tree(tree,
+                          col_ids,
+                          sorted_col_ids,
+                          node_ids,
+                          row_offsets,
+                          is_array_of_arrays,
+                          row_array_parent_col_id,
+                          stream);
+
+  auto num_columns                      = d_unique_col_ids.size();
+  std::vector<std::string> column_names = copy_strings_to_host_sync(
+    input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
+  // array of arrays column names
+  if (is_array_of_arrays) {
+    auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
+    auto const column_parent_ids =
+      cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
+    TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
+    auto values_column_indices =
+      get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
+    auto h_values_column_indices =
+      cudf::detail::make_host_vector_sync(values_column_indices, stream);
+    std::transform(unique_col_ids.begin(),
+                   unique_col_ids.end(),
+                   column_names.cbegin(),
+                   column_names.begin(),
+                   [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id](
+                     auto col_id, auto name) mutable {
+                     return column_parent_ids[col_id] == row_array_parent_col_id
+                              ? std::to_string(h_values_column_indices[col_id])
+                              : name;
+                   });
+  }
+
+  auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() {
+    if (is_enabled_mixed_types_as_string) {
+      return cudf::detail::make_std_vector_sync(
+        is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream);
+    }
+    return std::vector<uint8_t>();
+  }();
+  auto const [ignore_vals, columns] = build_tree(root,
+                                                 is_str_column_all_nulls,
+                                                 d_column_tree,
+                                                 d_unique_col_ids,
+                                                 d_max_row_offsets,
+                                                 column_names,
+                                                 row_array_parent_col_id,
+                                                 is_array_of_arrays,
+                                                 options,
+                                                 stream,
+                                                 mr);
+  if (ignore_vals.empty()) return;
+  scatter_offsets(tree,
+                  col_ids,
+                  row_offsets,
+                  node_ids,
+                  sorted_col_ids,
+                  d_column_tree,
+                  ignore_vals,
+                  columns,
+                  stream);
+}
+
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
+  tree_meta_t& d_column_tree,
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  bool const is_enabled_lines                 = options.is_enabled_lines();
+  bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
+  auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
+  auto column_categories =
+    cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream);
+  auto const column_parent_ids =
+    cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
+  auto column_range_beg =
+    cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
+  auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
+  auto num_columns           = d_unique_col_ids.size();
+  stream.synchronize();
+
+  auto to_json_col_type = [](auto category) {
+    switch (category) {
+      case NC_STRUCT: return json_col_t::StructColumn;
+      case NC_LIST: return json_col_t::ListColumn;
+      case NC_STR: [[fallthrough]];
+      case NC_VAL: return json_col_t::StringColumn;
+      default: return json_col_t::Unknown;
+    }
+  };
+
+  auto initialize_json_columns = [&](auto i, auto& col_ref, auto column_category) {
+    auto& col = col_ref.get();
+    if (col.type != json_col_t::Unknown) { return; }
+    if (column_category == NC_ERR || column_category == NC_FN) {
+      return;
+    } else if (column_category == NC_VAL || column_category == NC_STR) {
+      col.string_offsets.resize(max_row_offsets[i] + 1, stream);
+      col.string_lengths.resize(max_row_offsets[i] + 1, stream);
+      thrust::fill(
+        rmm::exec_policy_nosync(stream),
+        thrust::make_zip_iterator(col.string_offsets.begin(), col.string_lengths.begin()),
+        thrust::make_zip_iterator(col.string_offsets.end(), col.string_lengths.end()),
+        thrust::make_tuple(0, 0));
+    } else if (column_category == NC_LIST) {
+      col.child_offsets.resize(max_row_offsets[i] + 2, stream);
+      thrust::uninitialized_fill(
+        rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), 0);
+    }
+    col.num_rows = max_row_offsets[i] + 1;
+    col.validity =
+      cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
+    col.type = to_json_col_type(column_category);
+  };
+
+  // 2. generate nested columns tree and its device_memory
+  // reorder unique_col_ids by column_range_begin so that columns appear in field order.
+  auto h_range_col_id_it =
+    thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin());
+  std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
+    return thrust::get<0>(a) < thrust::get<0>(b);
+  });
+  // adjacency list construction
+  std::map<NodeIndexT, std::vector<NodeIndexT>> adj;
+  for (auto const this_col_id : unique_col_ids) {
+    auto parent_col_id = column_parent_ids[this_col_id];
+    adj[parent_col_id].push_back(this_col_id);
+  }
+
+  // Pruning
+  auto is_pruned = cudf::detail::make_host_vector<bool>(num_columns, stream);
+  std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns());
+
+  // prune all children of a column, but not self.
+  auto ignore_all_children = [&](auto parent_col_id) {
+    std::deque<NodeIndexT> offspring;
+    if (adj.count(parent_col_id)) {
+      for (auto const& child : adj[parent_col_id]) {
+        offspring.push_back(child);
+      }
+    }
+    while (!offspring.empty()) {
+      auto this_id = offspring.front();
+      offspring.pop_front();
+      is_pruned[this_id] = true;
+      if (adj.count(this_id)) {
+        for (auto const& child : adj[this_id]) {
+          offspring.push_back(child);
+        }
+      }
+    }
+  };
+
+  // Pruning: iterate through schema and mark only those columns and enforce type.
+  // NoPruning: iterate through schema and enforce type.
+
+  if (adj[parent_node_sentinel].empty())
+    return {cudf::detail::make_host_vector<bool>(0, stream), {}};  // for empty file
+  CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1");
+  auto expected_types = cudf::detail::make_host_vector<NodeT>(num_columns, stream);
+  std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES);
+
+  auto lookup_names = [&column_names](auto child_ids, auto name) {
+    for (auto const& child_id : child_ids) {
+      if (column_names[child_id] == name) return child_id;
+    }
+    return -1;
+  };
+  // recursive lambda on schema to mark columns as pruned.
+  std::function<void(NodeIndexT root, schema_element const& schema)> mark_is_pruned;
+  mark_is_pruned = [&is_pruned,
+                    &mark_is_pruned,
+                    &adj,
+                    &lookup_names,
+                    &column_categories,
+                    &expected_types,
+                    &ignore_all_children](NodeIndexT root, schema_element const& schema) -> void {
+    if (root == -1) return;
+    bool pass =
+      (schema.type == data_type{type_id::STRUCT} and column_categories[root] == NC_STRUCT) or
+      (schema.type == data_type{type_id::LIST} and column_categories[root] == NC_LIST) or
+      (schema.type != data_type{type_id::STRUCT} and schema.type != data_type{type_id::LIST} and
+       column_categories[root] != NC_FN);
+    if (!pass) {
+      // ignore all children of this column and prune this column.
+      is_pruned[root] = true;
+      ignore_all_children(root);
+      return;
+    }
+    is_pruned[root]    = false;
+    auto expected_type = [](auto type, auto cat) {
+      if (type == data_type{type_id::STRUCT} and cat == NC_STRUCT) return NC_STRUCT;
+      if (type == data_type{type_id::LIST} and cat == NC_LIST) return NC_LIST;
+      if (type != data_type{type_id::STRUCT} and type != data_type{type_id::LIST}) return NC_STR;
+      return NC_ERR;
+    }(schema.type, column_categories[root]);
+    expected_types[root] = expected_type;  // forced type.
+    // ignore children of nested columns, but not self.
+    if (expected_type == NC_STR and
+        (column_categories[root] == NC_STRUCT or column_categories[root] == NC_LIST))
+      ignore_all_children(root);
+    if (not(schema.type == data_type{type_id::STRUCT} or schema.type == data_type{type_id::LIST}))
+      return;  // no children to mark for non-nested.
+    auto child_ids = adj.count(root) ? adj[root] : std::vector<NodeIndexT>{};
+    if (schema.type == data_type{type_id::STRUCT}) {
+      for (auto const& key_pair : schema.child_types) {
+        auto col_id = lookup_names(child_ids, key_pair.first);
+        if (col_id == -1) continue;
+        is_pruned[col_id] = false;
+        for (auto const& child_id : adj[col_id])  // children of field (>1 if mixed)
+          mark_is_pruned(child_id, key_pair.second);
+      }
+    } else if (schema.type == data_type{type_id::LIST}) {
+      // partial solution for list children to have any name.
+      auto this_list_child_name =
+        schema.child_types.size() == 1 ? schema.child_types.begin()->first : list_child_name;
+      if (schema.child_types.count(this_list_child_name) == 0) return;
+      auto list_child = schema.child_types.at(this_list_child_name);
+      for (auto const& child_id : child_ids)
+        mark_is_pruned(child_id, list_child);
+    }
+  };
+  if (is_array_of_arrays) {
+    if (adj[adj[parent_node_sentinel][0]].empty())
+      return {cudf::detail::make_host_vector<bool>(0, stream), {}};
+    auto root_list_col_id =
+      is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0];
+    // mark root and row array col_id as not pruned.
+    if (!is_enabled_lines) {
+      auto top_level_list_id       = adj[parent_node_sentinel][0];
+      is_pruned[top_level_list_id] = false;
+    }
+    is_pruned[root_list_col_id] = false;
+    std::visit(cudf::detail::visitor_overload{
+                 [&root_list_col_id, &adj, &mark_is_pruned, &column_names](
+                   std::vector<data_type> const& user_dtypes) -> void {
+                   for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size();
+                        i++) {
+                     NodeIndexT const first_child_id = adj[root_list_col_id][i];
+                     auto name                       = column_names[first_child_id];
+                     auto value_id                   = std::stol(name);
+                     if (value_id >= 0 and value_id < static_cast<long>(user_dtypes.size()))
+                       mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]});
+                     // Note: mixed type - forced type, will work here.
+                   }
+                 },
+                 [&root_list_col_id, &adj, &mark_is_pruned, &column_names](
+                   std::map<std::string, data_type> const& user_dtypes) -> void {
+                   for (size_t i = 0; i < adj[root_list_col_id].size(); i++) {
+                     auto const first_child_id = adj[root_list_col_id][i];
+                     auto name                 = column_names[first_child_id];
+                     if (user_dtypes.count(name))
+                       mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)});
+                   }
+                 },
+                 [&root_list_col_id, &adj, &mark_is_pruned, &column_names](
+                   std::map<std::string, schema_element> const& user_dtypes) -> void {
+                   for (size_t i = 0; i < adj[root_list_col_id].size(); i++) {
+                     auto const first_child_id = adj[root_list_col_id][i];
+                     auto name                 = column_names[first_child_id];
+                     if (user_dtypes.count(name))
+                       mark_is_pruned(first_child_id, user_dtypes.at(name));
+                   }
+                 }},
+               options.get_dtypes());
+  } else {
+    auto root_struct_col_id =
+      is_enabled_lines
+        ? adj[parent_node_sentinel][0]
+        : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]);
+    // mark root and row struct col_id as not pruned.
+    if (!is_enabled_lines) {
+      auto top_level_list_id       = adj[parent_node_sentinel][0];
+      is_pruned[top_level_list_id] = false;
+    }
+    is_pruned[root_struct_col_id] = false;
+    schema_element u_schema{data_type{type_id::STRUCT}};
+    u_schema.child_types = unified_schema(options);
+    std::visit(
+      cudf::detail::visitor_overload{
+        [&is_pruned, &root_struct_col_id, &adj, &mark_is_pruned](
+          std::vector<data_type> const& user_dtypes) -> void {
+          for (size_t i = 0; i < adj[root_struct_col_id].size() && i < user_dtypes.size(); i++) {
+            NodeIndexT const first_field_id = adj[root_struct_col_id][i];
+            is_pruned[first_field_id]       = false;
+            for (auto const& child_id : adj[first_field_id])  // children of field (>1 if mixed)
+              mark_is_pruned(child_id, schema_element{user_dtypes[i]});
+          }
+        },
+        [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](
+          std::map<std::string, data_type> const& user_dtypes) -> void {
+          mark_is_pruned(root_struct_col_id, u_schema);
+        },
+        [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](
+          std::map<std::string, schema_element> const& user_dtypes) -> void {
+          mark_is_pruned(root_struct_col_id, u_schema);
+        }},
+      options.get_dtypes());
+  }
+  // Useful for array of arrays
+  auto named_level =
+    is_enabled_lines
+      ? adj[parent_node_sentinel][0]
+      : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]);
+
+  auto handle_mixed_types = [&column_categories,
+                             &is_str_column_all_nulls,
+                             &is_pruned,
+                             &expected_types,
+                             &is_enabled_mixed_types_as_string,
+                             &ignore_all_children](std::vector<NodeIndexT>& child_ids) {
+    // do these on unpruned columns only.
+    // when mixed types is disabled, ignore string sibling of nested column.
+    // when mixed types is disabled, and both list and struct columns are siblings, error out.
+    // when mixed types is enabled, force string type on all columns
+
+    // Remove pruned children (forced type will not clash here because other types are already
+    // pruned)
+    child_ids.erase(
+      std::remove_if(child_ids.begin(),
+                     child_ids.end(),
+                     [&is_pruned](NodeIndexT child_id) { return is_pruned[child_id]; }),
+      child_ids.end());
+    // find string id, struct id, list id.
+    NodeIndexT str_col_id{-1}, struct_col_id{-1}, list_col_id{-1};
+    for (auto const& child_id : child_ids) {
+      if (column_categories[child_id] == NC_VAL || column_categories[child_id] == NC_STR)
+        str_col_id = child_id;
+      else if (column_categories[child_id] == NC_STRUCT)
+        struct_col_id = child_id;
+      else if (column_categories[child_id] == NC_LIST)
+        list_col_id = child_id;
+    }
+    // conditions for handling mixed types.
+    if (is_enabled_mixed_types_as_string) {
+      if (struct_col_id != -1 and list_col_id != -1) {
+        expected_types[struct_col_id] = NC_STR;
+        expected_types[list_col_id]   = NC_STR;
+        // ignore children of nested columns.
+        ignore_all_children(struct_col_id);
+        ignore_all_children(list_col_id);
+      }
+      if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) {
+        if (is_str_column_all_nulls[str_col_id])
+          is_pruned[str_col_id] = true;
+        else {
+          // ignore children of nested columns.
+          if (struct_col_id != -1) {
+            expected_types[struct_col_id] = NC_STR;
+            ignore_all_children(struct_col_id);
+          }
+          if (list_col_id != -1) {
+            expected_types[list_col_id] = NC_STR;
+            ignore_all_children(list_col_id);
+          }
+        }
+      }
+    } else {
+      // if both are present, error out.
+      CUDF_EXPECTS(struct_col_id == -1 or list_col_id == -1,
+                   "A mix of lists and structs within the same column is not supported");
+      // either one only: so ignore str column.
+      if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) {
+        is_pruned[str_col_id] = true;
+      }
+    }
+  };
+
+  using dev_ref = std::reference_wrapper<device_json_column>;
+  std::unordered_map<NodeIndexT, dev_ref> columns;
+  columns.try_emplace(parent_node_sentinel, std::ref(root));
+  // convert adjacency list to tree.
+  dev_ref parent_ref = std::ref(root);
+  // creates children column
+  std::function<void(NodeIndexT, dev_ref)> construct_tree;
+  construct_tree = [&](NodeIndexT root, dev_ref ref) -> void {
+    if (is_pruned[root]) return;
+    auto expected_category =
+      expected_types[root] == NUM_NODE_CLASSES ? column_categories[root] : expected_types[root];
+    initialize_json_columns(root, ref, expected_category);
+    auto child_ids = adj.count(root) ? adj[root] : std::vector<NodeIndexT>{};
+    if (expected_category == NC_STRUCT) {
+      // find field column ids, and its children and create columns.
+      for (auto const& field_id : child_ids) {
+        auto name = column_names[field_id];
+        if (is_pruned[field_id]) continue;
+        auto inserted =
+          ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second;
+        ref.get().column_order.emplace_back(name);
+        CUDF_EXPECTS(inserted,
+                     "struct child column insertion failed, duplicate column name in the parent");
+        auto this_ref = std::ref(ref.get().child_columns.at(name));
+        // Mixed type handling
+        auto& value_col_ids = adj[field_id];
+        handle_mixed_types(value_col_ids);
+        if (value_col_ids.empty()) {
+          // If no column is present, remove the uninitialized column.
+          ref.get().child_columns.erase(name);
+          ref.get().column_order.pop_back();
+          continue;
+        }
+        for (auto const& child_id : value_col_ids)  // children of field (>1 if mixed)
+        {
+          if (is_pruned[child_id]) continue;
+          columns.try_emplace(child_id, this_ref);
+          construct_tree(child_id, this_ref);
+        }
+      }
+    } else if (expected_category == NC_LIST) {
+      // array of arrays interpreted as array of structs.
+      if (is_array_of_arrays and root == named_level) {
+        // create column names
+        std::map<NodeIndexT, std::vector<NodeIndexT>> array_values;
+        for (auto const& child_id : child_ids) {
+          if (is_pruned[child_id]) continue;
+          auto name = column_names[child_id];
+          array_values[std::stoi(name)].push_back(child_id);
+        }
+        //
+        for (auto const& value_id_pair : array_values) {
+          auto [value_id, value_col_ids] = value_id_pair;
+          auto name                      = std::to_string(value_id);
+          auto inserted =
+            ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second;
+          ref.get().column_order.emplace_back(name);
+          CUDF_EXPECTS(inserted,
+                       "list child column insertion failed, duplicate column name in the parent");
+          auto this_ref = std::ref(ref.get().child_columns.at(name));
+          handle_mixed_types(value_col_ids);
+          if (value_col_ids.empty()) {
+            // If no column is present, remove the uninitialized column.
+            ref.get().child_columns.erase(name);
+            ref.get().column_order.pop_back();
+            continue;
+          }
+          for (auto const& child_id : value_col_ids)  // children of field (>1 if mixed)
+          {
+            if (is_pruned[child_id]) continue;
+            columns.try_emplace(child_id, this_ref);
+            construct_tree(child_id, this_ref);
+          }
+        }
+      } else {
+        if (child_ids.empty()) return;
+        auto inserted =
+          ref.get()
+            .child_columns.try_emplace(list_child_name, device_json_column(stream, mr))
+            .second;
+        CUDF_EXPECTS(inserted,
+                     "list child column insertion failed, duplicate column name in the parent");
+        ref.get().column_order.emplace_back(list_child_name);
+        auto this_ref = std::ref(ref.get().child_columns.at(list_child_name));
+        // Mixed type handling
+        handle_mixed_types(child_ids);
+        if (child_ids.empty()) {
+          // If no column is present, remove the uninitialized column.
+          ref.get().child_columns.erase(list_child_name);
+        }
+        for (auto const& child_id : child_ids) {
+          if (is_pruned[child_id]) continue;
+          columns.try_emplace(child_id, this_ref);
+          construct_tree(child_id, this_ref);
+        }
+      }
+    }
+  };
+  auto inserted = parent_ref.get()
+                    .child_columns.try_emplace(list_child_name, device_json_column(stream, mr))
+                    .second;
+  CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent");
+  parent_ref = std::ref(parent_ref.get().child_columns.at(list_child_name));
+  columns.try_emplace(adj[parent_node_sentinel][0], parent_ref);
+  construct_tree(adj[parent_node_sentinel][0], parent_ref);
+
+  // Forced string type due to input schema and mixed type as string.
+  for (size_t i = 0; i < expected_types.size(); i++) {
+    if (expected_types[i] == NC_STR) {
+      if (columns.count(i)) { columns.at(i).get().forced_as_string_column = true; }
+    }
+  }
+  std::transform(expected_types.cbegin(),
+                 expected_types.cend(),
+                 column_categories.cbegin(),
+                 expected_types.begin(),
+                 [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; });
+  cudaMemcpyAsync(d_column_tree.node_categories.begin(),
+                  expected_types.data(),
+                  expected_types.size() * sizeof(column_categories[0]),
+                  cudaMemcpyDefault,
+                  stream.value());
+
+  return {is_pruned, columns};
+}
+}  // namespace experimental
+
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index dfd9285f682..912e93d52ae 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -104,7 +104,7 @@ void print_tree(host_span<SymbolT const> input,
  * max row offsets of columns
  */
 std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
-reduce_to_column_tree(tree_meta_t& tree,
+reduce_to_column_tree(tree_meta_t const& tree,
                       device_span<NodeIndexT const> original_col_ids,
                       device_span<NodeIndexT const> sorted_col_ids,
                       device_span<NodeIndexT const> ordered_node_ids,
@@ -317,7 +317,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
     // Note: json_col modified here, moves this memory
   };
 
-  auto get_child_schema = [schema](auto child_name) -> std::optional<schema_element> {
+  auto get_child_schema = [&schema](auto child_name) -> std::optional<schema_element> {
     if (schema.has_value()) {
       auto const result = schema.value().child_types.find(child_name);
       if (result != std::end(schema.value().child_types)) { return result->second; }
@@ -325,6 +325,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
     return {};
   };
 
+  auto get_list_child_schema = [&schema]() -> std::optional<schema_element> {
+    if (schema.has_value()) {
+      if (schema.value().child_types.size() > 0) return schema.value().child_types.begin()->second;
+    }
+    return {};
+  };
+
   switch (json_col.type) {
     case json_col_t::StringColumn: {
       // move string_offsets to GPU and transform to string column
@@ -439,9 +446,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                                                      rmm::device_buffer{},
                                                      0);
       // Create children column
-      auto child_schema_element = json_col.child_columns.empty()
-                                    ? std::optional<schema_element>{}
-                                    : get_child_schema(json_col.child_columns.begin()->first);
+      auto child_schema_element =
+        json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
       auto [child_column, names] =
         json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
           ? std::pair<std::unique_ptr<column>,
@@ -479,6 +485,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
   }
 }
 
+// Selects the JSON column builder at runtime. `experimental::` always resolves to the
+// namespace: lookup of a name before `::` ignores variables such as the bool parameter.
+template <typename... Args>
+auto make_device_json_column_dispatch(bool experimental, Args&&... args)
+{
+  // Default path first; every remaining argument is perfectly forwarded to the chosen overload.
+  if (!experimental) { return make_device_json_column(std::forward<Args>(args)...); }
+  return experimental::make_device_json_column(std::forward<Args>(args)...);
+}
+
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                              cudf::io::json_reader_options const& options,
                                              rmm::cuda_stream_view stream,
@@ -524,6 +540,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                   gpu_tree,
                                   is_array_of_arrays,
                                   options.is_enabled_lines(),
+                                  options.is_enabled_experimental(),
                                   stream,
                                   cudf::get_current_device_resource_ref());
 
@@ -536,15 +553,16 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                0);
 
   // Get internal JSON column
-  make_device_json_column(d_input,
-                          gpu_tree,
-                          gpu_col_id,
-                          gpu_row_offsets,
-                          root_column,
-                          is_array_of_arrays,
-                          options,
-                          stream,
-                          mr);
+  make_device_json_column_dispatch(options.is_enabled_experimental(),
+                                   d_input,
+                                   gpu_tree,
+                                   gpu_col_id,
+                                   gpu_row_offsets,
+                                   root_column,
+                                   is_array_of_arrays,
+                                   options,
+                                   stream,
+                                   mr);
 
   // data_root refers to the root column of the data represented by the given JSON string
   auto& data_root =
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 4d0dc010c57..d949635c1cc 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -14,17 +14,18 @@
  * limitations under the License.
  */
 
-#include "io/utilities/hostdevice_vector.hpp"
+#include "io/utilities/parsing_utils.cuh"
+#include "io/utilities/string_parsing.hpp"
 #include "nested_json.hpp"
 
 #include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/scatter.cuh>
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/hashing/detail/default_hash.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/helper_functions.cuh>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
@@ -34,12 +35,14 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cub/device/device_radix_sort.cuh>
+#include <cuco/static_map.cuh>
 #include <cuco/static_set.cuh>
 #include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/fill.h>
+#include <thrust/functional.h>
 #include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -492,6 +495,85 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
           std::move(node_range_end)};
 }
 
+// Decodes field names of `keys`; returns {distinct-name count, index of the matching key per key}.
+std::pair<size_t, rmm::device_uvector<size_type>> remapped_field_nodes_after_unicode_decode(
+  device_span<SymbolT const> d_input,
+  tree_meta_t const& d_tree,
+  device_span<size_type const> keys,
+  rmm::cuda_stream_view stream)
+{
+  size_t num_keys = keys.size();
+  if (num_keys == 0) { return {num_keys, rmm::device_uvector<size_type>(num_keys, stream)}; }
+  rmm::device_uvector<size_type> offsets(num_keys, stream);  // node_range_begin per key
+  rmm::device_uvector<size_type> lengths(num_keys, stream);  // node range length per key
+  auto offset_length_it = thrust::make_zip_iterator(offsets.begin(), lengths.begin());
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    keys.begin(),
+                    keys.end(),
+                    offset_length_it,
+                    [node_range_begin = d_tree.node_range_begin.data(),
+                     node_range_end   = d_tree.node_range_end.data()] __device__(auto key) {
+                      return thrust::make_tuple(node_range_begin[key],
+                                                node_range_end[key] - node_range_begin[key]);
+                    });
+  cudf::io::parse_options_view opt{',', '\n', '\0', '.'};
+  opt.keepquotes = true;
+
+  auto utf8_decoded_fields = parse_data(d_input.data(),
+                                        offset_length_it,
+                                        num_keys,
+                                        data_type{type_id::STRING},
+                                        rmm::device_buffer{},
+                                        0,
+                                        opt,
+                                        stream,
+                                        cudf::get_current_device_resource_ref());
+  // Deduplicate by decoded name: row indices [0, num_keys) are inserted into a set whose hash
+  // and equality read the decoded strings, and found_keys[i] receives the index stored in the set
+  // that compares equal to i. A caller can then relate keys[i] to keys[found_keys[i]].
+
+  auto str_view         = strings_column_view{utf8_decoded_fields->view()};
+  auto const char_ptr   = str_view.chars_begin(stream);
+  auto const offset_ptr = str_view.offsets().begin<size_type>();
+
+  // Hasher and equality below address decoded field names by row index of the strings column
+  auto const d_hasher = cuda::proclaim_return_type<
+    typename cudf::hashing::detail::default_hash<cudf::string_view>::result_type>(
+    [char_ptr, offset_ptr] __device__(auto node_id) {
+      auto const field_name = cudf::string_view(char_ptr + offset_ptr[node_id],
+                                                offset_ptr[node_id + 1] - offset_ptr[node_id]);
+      return cudf::hashing::detail::default_hash<cudf::string_view>{}(field_name);
+    });
+  auto const d_equal = [char_ptr, offset_ptr] __device__(auto node_id1, auto node_id2) {
+    auto const field_name1 = cudf::string_view(char_ptr + offset_ptr[node_id1],
+                                               offset_ptr[node_id1 + 1] - offset_ptr[node_id1]);
+    auto const field_name2 = cudf::string_view(char_ptr + offset_ptr[node_id2],
+                                               offset_ptr[node_id2 + 1] - offset_ptr[node_id2]);
+    return field_name1 == field_name2;
+  };
+
+  using hasher_type                             = decltype(d_hasher);
+  constexpr size_type empty_node_index_sentinel = -1;
+  auto key_set                                  = cuco::static_set{
+    cuco::extent{compute_hash_table_size(num_keys)},
+    cuco::empty_key{empty_node_index_sentinel},
+    d_equal,
+    cuco::linear_probing<1, hasher_type>{d_hasher},
+                                     {},
+                                     {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
+  auto const counting_iter = thrust::make_counting_iterator<size_type>(0);
+  rmm::device_uvector<size_type> found_keys(num_keys, stream);  // set element equal to each index
+  key_set.insert_and_find_async(counting_iter,
+                                counting_iter + num_keys,
+                                found_keys.begin(),
+                                thrust::make_discard_iterator(),  // "was inserted" flags unused
+                                stream.value());
+  // set.size will synchronize the stream before return.
+  return {key_set.size(stream), std::move(found_keys)};
+}
+
 /**
  * @brief Generates unique node_type id for each node.
  * Field nodes with the same name are assigned the same node_type id.
@@ -500,11 +582,14 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
  * All inputs and outputs are in node_id order.
  * @param d_input JSON string in device memory
  * @param d_tree Tree representation of the JSON
+ * @param is_enabled_experimental Whether to enable experimental features such as
+ * utf8 field name support
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @return Vector of node_type ids
  */
 rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<SymbolT const> d_input,
                                                               tree_meta_t const& d_tree,
+                                                              bool is_enabled_experimental,
                                                               rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
@@ -536,7 +621,7 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
   };
   // key-value pairs: uses node_id itself as node_type. (unique node_id for a field name due to
   // hashing)
-  auto const iter = thrust::make_counting_iterator<size_type>(0);
+  auto const counting_iter = thrust::make_counting_iterator<size_type>(0);
 
   auto const is_field_name_node = [node_categories =
                                      d_tree.node_categories.data()] __device__(auto node_id) {
@@ -554,15 +639,61 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
                                      {},
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value()};
-  key_set.insert_if_async(iter,
-                          iter + num_nodes,
+  key_set.insert_if_async(counting_iter,
+                          counting_iter + num_nodes,
                           thrust::counting_iterator<size_type>(0),  // stencil
                           is_field_name_node,
                           stream.value());
 
+  // experimental feature: utf8 field name support
+  // parse_data on field names,
+  // rehash it using another map,
+  // reassign the reverse map values to new matched node indices.
+  auto get_utf8_matched_field_nodes = [&]() {
+    auto make_map = [&stream](auto num_keys) {
+      using hasher_type3 = cudf::hashing::detail::default_hash<size_type>;
+      return cuco::static_map{
+        cuco::extent{compute_hash_table_size(num_keys, 100)},  // 100% occupancy
+        cuco::empty_key{empty_node_index_sentinel},
+        cuco::empty_value{empty_node_index_sentinel},
+        {},
+        cuco::linear_probing<1, hasher_type3>{hasher_type3{}},
+        {},
+        {},
+        cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+        stream.value()};
+    };
+    if (!is_enabled_experimental) { return std::pair{false, make_map(0)}; }
+    // get all unique field node ids for utf8 decoding
+    auto num_keys = key_set.size(stream);
+    rmm::device_uvector<size_type> keys(num_keys, stream);
+    key_set.retrieve_all(keys.data(), stream.value());
+
+    auto [num_unique_fields, found_keys] =
+      remapped_field_nodes_after_unicode_decode(d_input, d_tree, keys, stream);
+
+    auto is_need_remap = num_unique_fields != num_keys;
+    if (!is_need_remap) { return std::pair{false, make_map(0)}; }
+
+    // store to static_map with keys as field keys[index], and values as keys[found_keys[index]]
+    auto reverse_map        = make_map(num_keys);
+    auto matching_keys_iter = thrust::make_permutation_iterator(keys.begin(), found_keys.begin());
+    auto pair_iter =
+      thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), matching_keys_iter));
+    reverse_map.insert_async(pair_iter, pair_iter + num_keys, stream);
+    return std::pair{is_need_remap, std::move(reverse_map)};
+  };
+  auto [is_need_remap, reverse_map] = get_utf8_matched_field_nodes();
+
   auto const get_hash_value =
-    [key_set = key_set.ref(cuco::op::find)] __device__(auto node_id) -> size_type {
+    [key_set       = key_set.ref(cuco::op::find),
+     is_need_remap = is_need_remap,
+     rm            = reverse_map.ref(cuco::op::find)] __device__(auto node_id) -> size_type {
     auto const it = key_set.find(node_id);
+    if (it != key_set.end() and is_need_remap) {
+      auto const it2 = rm.find(*it);
+      return (it2 == rm.end()) ? size_type{0} : it2->second;
+    }
     return (it == key_set.end()) ? size_type{0} : *it;
   };
 
@@ -771,6 +902,8 @@ std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_n
  * @param d_tree Tree representation of the JSON
  * @param is_array_of_arrays Whether the tree is an array of arrays
  * @param is_enabled_lines Whether the input is a line-delimited JSON
+ * @param is_enabled_experimental Whether the experimental feature is enabled such as
+ * utf8 field name support
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return column_id, parent_column_id
@@ -780,6 +913,7 @@ std::pair<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<NodeIndexT>> gene
   tree_meta_t const& d_tree,
   bool is_array_of_arrays,
   bool is_enabled_lines,
+  bool is_enabled_experimental,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
@@ -793,7 +927,7 @@ std::pair<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<NodeIndexT>> gene
   auto [col_id, unique_keys] = [&]() {
     // Convert node_category + field_name to node_type.
     rmm::device_uvector<size_type> node_type =
-      hash_node_type_with_field_name(d_input, d_tree, stream);
+      hash_node_type_with_field_name(d_input, d_tree, is_enabled_experimental, stream);
 
     // hash entire path from node to root.
     return hash_node_path(d_tree.node_levels,
@@ -948,12 +1082,13 @@ records_orient_tree_traversal(device_span<SymbolT const> d_input,
                               tree_meta_t const& d_tree,
                               bool is_array_of_arrays,
                               bool is_enabled_lines,
+                              bool is_enabled_experimental,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  auto [new_col_id, new_parent_col_id] =
-    generate_column_id(d_input, d_tree, is_array_of_arrays, is_enabled_lines, stream, mr);
+  auto [new_col_id, new_parent_col_id] = generate_column_id(
+    d_input, d_tree, is_array_of_arrays, is_enabled_lines, is_enabled_experimental, stream, mr);
 
   auto row_offsets = compute_row_offsets(
     std::move(new_parent_col_id), d_tree, is_array_of_arrays, is_enabled_lines, stream, mr);
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 93ef2b46be1..3d9a51833e0 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -316,6 +316,8 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
  * index, level, begin index, and end index in the input JSON string
  * @param is_array_of_arrays Whether the tree is an array of arrays
  * @param is_enabled_lines Whether the input is a line-delimited JSON
+ * @param is_enabled_experimental Whether to enable experimental features such as utf-8 field name
+ * support
  * @param stream The CUDA stream to which kernels are dispatched
  * @param mr Optional, resource with which to allocate
  * @return A tuple of the output column indices and the row offsets within each column for each node
@@ -326,6 +328,7 @@ records_orient_tree_traversal(device_span<SymbolT const> d_input,
                               tree_meta_t const& d_tree,
                               bool is_array_of_arrays,
                               bool is_enabled_lines,
+                              bool is_enabled_experimental,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr);
 
@@ -352,7 +355,7 @@ get_array_children_indices(TreeDepthT row_array_children_level,
 /**
  * @brief Reduces node tree representation to column tree representation.
  *
- * @param node_tree Node tree representation of JSON string
+ * @param tree Node tree representation of JSON string
  * @param original_col_ids Column ids of nodes
  * @param sorted_col_ids Sorted column ids of nodes
  * @param ordered_node_ids Node ids of nodes sorted by column ids
@@ -365,7 +368,7 @@ get_array_children_indices(TreeDepthT row_array_children_level,
  */
 CUDF_EXPORT
 std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
-reduce_to_column_tree(tree_meta_t& node_tree,
+reduce_to_column_tree(tree_meta_t const& tree,
                       device_span<NodeIndexT const> original_col_ids,
                       device_span<NodeIndexT const> sorted_col_ids,
                       device_span<NodeIndexT const> ordered_node_ids,
@@ -393,14 +396,30 @@ reduce_to_column_tree(tree_meta_t& node_tree,
  * of child_offets and validity members of `d_json_column`
  */
 void make_device_json_column(device_span<SymbolT const> input,
-                             tree_meta_t& tree,
-                             device_span<NodeIndexT> col_ids,
-                             device_span<size_type> row_offsets,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
                              device_json_column& root,
                              bool is_array_of_arrays,
                              cudf::io::json_reader_options const& options,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
+
+namespace experimental {
+/**
+ * @copydoc cudf::io::json::detail::make_device_json_column
+ */
+void make_device_json_column(device_span<SymbolT const> input,
+                             tree_meta_t const& tree,
+                             device_span<NodeIndexT const> col_ids,
+                             device_span<size_type const> row_offsets,
+                             device_json_column& root,
+                             bool is_array_of_arrays,
+                             cudf::io::json_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr);
+}  // namespace experimental
+
 /**
  * @brief Retrieves the parse_options to be used for type inference and type casting
  *
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 48bc982d0e3..68ec255b39d 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2856,6 +2856,59 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren)
   }
 }
 
+TEST_F(JsonReaderTest, MixedTypesWithSchema)
+{
+  std::string data = "{\"data\": {\"A\": 0, \"B\": 1}}\n{\"data\": [1,0]}\n";
+
+  std::map<std::string, cudf::io::schema_element> data_types;
+  std::map<std::string, cudf::io::schema_element> child_types;
+  child_types.insert(
+    std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}});
+  data_types.insert(std::pair{
+    "data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}});
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(data_types)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .normalize_single_quotes(true)
+      .normalize_whitespace(true)
+      .mixed_types_as_string(true)
+      .experimental(true)
+      .keep_quotes(true)
+      .lines(true);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 2);
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::LIST);
+  EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::STRING);
+}
+
+TEST_F(JsonReaderTest, UnicodeFieldname)
+{
+  // unicode at nested and leaf levels
+  std::string data = R"({"data": {"a": 0, "b	c": 1}}
+  {"data": {"\u0061": 2, "\u0062\tc": 3}}
+  {"d\u0061ta": {"a": 4}})";
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .experimental(true)
+      .lines(true);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 3);
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
+  EXPECT_EQ(result.tbl->get_column(0).num_children(), 2);
+  EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info.at(0).name, "data");
+  EXPECT_EQ(result.metadata.schema_info.at(0).children.at(0).name, "a");
+  EXPECT_EQ(result.metadata.schema_info.at(0).children.at(1).name, "b\tc");
+  EXPECT_EQ(result.metadata.schema_info.at(0).children.size(), 2);
+}
+
 TEST_F(JsonReaderTest, JsonDtypeSchema)
 {
   std::string data = R"(
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 875cc467b6a..15682c6ae6b 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -889,6 +889,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal)
                                                      gpu_tree,
                                                      is_array_of_arrays,
                                                      json_lines,
+                                                     false,
                                                      stream,
                                                      cudf::get_current_device_resource_ref());
 #if LIBCUDF_JSON_DEBUG_DUMP
diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu
index a336b327732..f988ae24b38 100644
--- a/cpp/tests/io/json/json_tree_csr.cu
+++ b/cpp/tests/io/json/json_tree_csr.cu
@@ -168,6 +168,7 @@ void run_test(std::string const& input, bool enable_lines = true)
                                                      gpu_tree,
                                                      is_array_of_arrays,
                                                      options.is_enabled_lines(),
+                                                     false,
                                                      stream,
                                                      rmm::mr::get_current_device_resource());
   auto& gpu_col_id      = std::get<0>(tup);
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 2bb74c3e3b1..e41cc15712f 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -39,6 +39,7 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean allowNonNumericNumbers;
   private final boolean allowUnquotedControlChars;
   private final boolean cudfPruneSchema;
+  private final boolean experimental;
   private final byte lineDelimiter;
 
   private JSONOptions(Builder builder) {
@@ -55,6 +56,7 @@ private JSONOptions(Builder builder) {
     allowNonNumericNumbers = builder.allowNonNumericNumbers;
     allowUnquotedControlChars = builder.allowUnquotedControlChars;
     cudfPruneSchema = builder.cudfPruneSchema;
+    experimental = builder.experimental;
     lineDelimiter = builder.lineDelimiter;
   }
 
@@ -111,6 +113,10 @@ public boolean unquotedControlChars() {
     return allowUnquotedControlChars;
   }
 
+  public boolean experimental() {
+    return experimental;
+  }
+
   @Override
   String[] getIncludeColumnNames() {
     throw new UnsupportedOperationException("JSON reader didn't support column prune");
@@ -136,6 +142,7 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
     private boolean keepQuotes = false;
 
     private boolean cudfPruneSchema = false;
+    private boolean experimental = false;
     private byte lineDelimiter = '\n';
 
     public Builder withCudfPruneSchema(boolean prune) {
@@ -159,6 +166,14 @@ public Builder withStrictValidation(boolean isAllowed) {
       return this;
     }
 
+    /**
+     * Should experimental features (such as UTF-8 field name support) be enabled or not.
+     */
+    public Builder withExperimental(boolean isAllowed) {
+      experimental = isAllowed;
+      return this;
+    }
+
     /**
      * Should leading zeros on numbers be allowed or not. Strict validation
      * must be enabled for this to have any effect.
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 6d370ca27b2..dbee53640aa 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -260,6 +260,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                         boolean allowNonNumericNumbers,
                                         boolean allowUnquotedControl,
                                         boolean pruneColumns,
+                                        boolean experimental,
                                         byte lineDelimiter) throws CudfException;
 
   private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
@@ -275,6 +276,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
                                       boolean allowNonNumericNumbers,
                                       boolean allowUnquotedControl,
                                       boolean pruneColumns,
+                                      boolean experimental,
                                       byte lineDelimiter,
                                       long dsHandle) throws CudfException;
 
@@ -288,6 +290,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool
                                       boolean allowLeadingZeros,
                                       boolean allowNonNumericNumbers,
                                       boolean allowUnquotedControl,
+                                      boolean experimental,
                                       byte lineDelimiter,
                                       long dsHandle) throws CudfException;
 
@@ -303,6 +306,7 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean allowLeadingZeros,
                                               boolean allowNonNumericNumbers,
                                               boolean allowUnquotedControl,
+                                              boolean experimental,
                                               byte lineDelimiter) throws CudfException;
 
   /**
@@ -1333,6 +1337,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
                     opts.nonNumericNumbersAllowed(),
                     opts.unquotedControlChars(),
                     cudfPruneSchema,
+                    opts.experimental(),
                     opts.getLineDelimiter()))) {
 
       return gatherJSONColumns(schema, twm, -1);
@@ -1417,6 +1422,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
         opts.leadingZerosAllowed(),
         opts.nonNumericNumbersAllowed(),
         opts.unquotedControlChars(),
+        opts.experimental(),
         opts.getLineDelimiter()));
   }
 
@@ -1439,6 +1445,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
           opts.leadingZerosAllowed(),
           opts.nonNumericNumbersAllowed(),
           opts.unquotedControlChars(),
+          opts.experimental(),
           opts.getLineDelimiter(),
           dsHandle));
         return twm;
@@ -1499,6 +1506,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
             opts.nonNumericNumbersAllowed(),
             opts.unquotedControlChars(),
             cudfPruneSchema,
+            opts.experimental(),
             opts.getLineDelimiter()))) {
       return gatherJSONColumns(schema, twm, emptyRowCount);
     }
@@ -1543,6 +1551,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
         opts.nonNumericNumbersAllowed(),
         opts.unquotedControlChars(),
         cudfPruneSchema,
+        opts.experimental(),
         opts.getLineDelimiter(),
         dsHandle))) {
       return gatherJSONColumns(schema, twm, emptyRowCount);
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 0f77da54152..0a667978ca3 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
                                                          jboolean allow_leading_zeros,
                                                          jboolean allow_nonnumeric_numbers,
                                                          jboolean allow_unquoted_control,
+                                                         jboolean experimental,
                                                          jbyte line_delimiter,
                                                          jlong ds_handle)
 {
@@ -1649,6 +1650,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .mixed_types_as_string(mixed_types_as_string)
         .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
+        .experimental(experimental)
         .keep_quotes(keep_quotes)
         .prune_columns(false);
     if (strict_validation) {
@@ -1680,6 +1682,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
                                            jboolean allow_leading_zeros,
                                            jboolean allow_nonnumeric_numbers,
                                            jboolean allow_unquoted_control,
+                                           jboolean experimental,
                                            jbyte line_delimiter)
 {
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
@@ -1705,6 +1708,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .strict_validation(strict_validation)
         .mixed_types_as_string(mixed_types_as_string)
         .prune_columns(false)
+        .experimental(experimental)
         .delimiter(static_cast<char>(line_delimiter))
         .keep_quotes(keep_quotes);
     if (strict_validation) {
@@ -1821,6 +1825,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
                                                  jboolean allow_nonnumeric_numbers,
                                                  jboolean allow_unquoted_control,
                                                  jboolean prune_columns,
+                                                 jboolean experimental,
                                                  jbyte line_delimiter,
                                                  jlong ds_handle)
 {
@@ -1859,7 +1864,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
         .keep_quotes(keep_quotes)
-        .prune_columns(prune_columns);
+        .prune_columns(prune_columns)
+        .experimental(experimental);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
@@ -1920,6 +1926,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
                                                            jboolean allow_nonnumeric_numbers,
                                                            jboolean allow_unquoted_control,
                                                            jboolean prune_columns,
+                                                           jboolean experimental,
                                                            jbyte line_delimiter)
 {
   bool read_buffer = true;
@@ -1972,7 +1979,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
         .keep_quotes(keep_quotes)
-        .prune_columns(prune_columns);
+        .prune_columns(prune_columns)
+        .experimental(experimental);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)

From 61af76978e97d94c1c9c7297fc73900d7827b433 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 16:48:51 -1000
Subject: [PATCH 07/11] Add io/timezone APIs to pylibcudf (#16771)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16771
---
 .../api_docs/pylibcudf/io/index.rst           |  1 +
 .../api_docs/pylibcudf/io/timezone.rst        |  6 +++
 python/cudf/cudf/_lib/timezone.pyx            | 27 ++----------
 python/pylibcudf/pylibcudf/io/CMakeLists.txt  |  4 +-
 python/pylibcudf/pylibcudf/io/__init__.pxd    |  2 +-
 python/pylibcudf/pylibcudf/io/__init__.py     |  2 +-
 python/pylibcudf/pylibcudf/io/timezone.pxd    |  6 +++
 python/pylibcudf/pylibcudf/io/timezone.pyx    | 43 +++++++++++++++++++
 .../pylibcudf/tests/io/test_timezone.py       | 16 +++++++
 9 files changed, 81 insertions(+), 26 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst
 create mode 100644 python/pylibcudf/pylibcudf/io/timezone.pxd
 create mode 100644 python/pylibcudf/pylibcudf/io/timezone.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_timezone.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index c8933981736..53638f071cc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,3 +19,4 @@ I/O Functions
     csv
     json
     parquet
+    timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst
new file mode 100644
index 00000000000..20c1ffc2e93
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst
@@ -0,0 +1,6 @@
+========
+Timezone
+========
+
+.. automodule:: pylibcudf.io.timezone
+   :members:
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
index bff3b2c4ce4..54624a5a2fd 100644
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -1,29 +1,10 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.optional cimport make_optional
-from libcpp.string cimport string
-from libcpp.utility cimport move
+import pylibcudf as plc
 
-from pylibcudf.libcudf.io.timezone cimport (
-    make_timezone_transition_table as cpp_make_timezone_transition_table,
-)
-from pylibcudf.libcudf.table.table cimport table
-
-from cudf._lib.utils cimport columns_from_unique_ptr
+from cudf._lib.column cimport Column
 
 
 def make_timezone_transition_table(tzdir, tzname):
-    cdef unique_ptr[table] c_result
-    cdef string c_tzdir = tzdir.encode()
-    cdef string c_tzname = tzname.encode()
-
-    with nogil:
-        c_result = move(
-            cpp_make_timezone_transition_table(
-                make_optional[string](c_tzdir),
-                c_tzname
-            )
-        )
-
-    return columns_from_unique_ptr(move(c_result))
+    plc_table = plc.io.timezone.make_timezone_transition_table(tzdir, tzname)
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index 529a71a48ce..965724a47b1 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,9 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx types.pyx)
+set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx
+                   types.pyx
+)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
index 5927a19dc69..1bcc0a3f963 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/io/__init__.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, orc, parquet, types
+from . cimport avro, datasource, json, orc, parquet, timezone, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 5d899ee0808..2e4f215b12c 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, orc, parquet, types
+from . import avro, csv, datasource, json, orc, parquet, timezone, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/io/timezone.pxd
new file mode 100644
index 00000000000..2aa755dbbd8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pxd
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from ..table cimport Table
+
+
+cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name)
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx
new file mode 100644
index 00000000000..e02239d7252
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pyx
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.optional cimport make_optional
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.libcudf.io.timezone cimport (
+    make_timezone_transition_table as cpp_make_timezone_transition_table,
+)
+from pylibcudf.libcudf.table.table cimport table
+
+from ..table cimport Table
+
+
+cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name):
+    """
+    Creates a transition table to convert ORC timestamps to UTC.
+
+    Parameters
+    ----------
+    tzif_dir : str
+        The directory where the TZif files are located.
+    timezone_name : str
+        Standard timezone name, e.g. ``America/Los_Angeles``.
+
+    Returns
+    -------
+    Table
+        The transition table for the given timezone.
+    """
+    cdef unique_ptr[table] c_result
+    cdef string c_tzdir = tzif_dir.encode()
+    cdef string c_tzname = timezone_name.encode()
+
+    with nogil:
+        c_result = move(
+            cpp_make_timezone_transition_table(
+                make_optional[string](c_tzdir),
+                c_tzname
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
new file mode 100644
index 00000000000..76b0424b2af
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import zoneinfo
+
+import pylibcudf as plc
+import pytest
+
+
+def test_make_timezone_transition_table():
+    if len(zoneinfo.TZPATH) == 0:
+        pytest.skip("No TZPATH available.")
+    tz_path = zoneinfo.TZPATH[0]
+    result = plc.io.timezone.make_timezone_transition_table(
+        tz_path, "America/Los_Angeles"
+    )
+    assert isinstance(result, plc.Table)
+    assert result.num_rows() > 0

From b00a718a7980fadc91c8b37d6bbe829e4b8549e8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 16:51:18 -1000
Subject: [PATCH 08/11] Add partitioning APIs to pylibcudf (#16781)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16781
---
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 .../api_docs/pylibcudf/partitioning.rst       |   6 +
 python/cudf/cudf/_lib/hash.pyx                |  35 ++---
 python/cudf/cudf/_lib/partitioning.pyx        |  35 +----
 python/pylibcudf/pylibcudf/CMakeLists.txt     |   1 +
 python/pylibcudf/pylibcudf/__init__.pxd       |   2 +
 python/pylibcudf/pylibcudf/__init__.py        |   2 +
 .../pylibcudf/libcudf/partitioning.pxd        |   7 +
 python/pylibcudf/pylibcudf/partitioning.pxd   |  19 +++
 python/pylibcudf/pylibcudf/partitioning.pyx   | 120 ++++++++++++++++++
 .../pylibcudf/tests/test_partitioning.py      |  55 ++++++++
 11 files changed, 229 insertions(+), 54 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst
 create mode 100644 python/pylibcudf/pylibcudf/partitioning.pxd
 create mode 100644 python/pylibcudf/pylibcudf/partitioning.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_partitioning.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index edb0963ed29..e21536e2e97 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -25,6 +25,7 @@ This page provides API documentation for pylibcudf.
     lists
     merge
     null_mask
+    partitioning
     quantiles
     reduce
     replace
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst
new file mode 100644
index 00000000000..6951dbecca0
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst
@@ -0,0 +1,6 @@
+============
+partitioning
+============
+
+.. automodule:: pylibcudf.partitioning
+   :members:
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 48f75b12a73..9b7ab0888d2 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -3,11 +3,8 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
 from libcpp.utility cimport move
-from libcpp.vector cimport vector
 
-cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.hash cimport (
     md5,
@@ -19,37 +16,23 @@ from pylibcudf.libcudf.hash cimport (
     sha512,
     xxhash_64,
 )
-from pylibcudf.libcudf.partitioning cimport (
-    hash_partition as cpp_hash_partition,
-)
-from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.table.table_view cimport table_view
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
+from cudf._lib.utils cimport table_view_from_columns
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
-def hash_partition(list source_columns, object columns_to_hash,
+def hash_partition(list source_columns, list columns_to_hash,
                    int num_partitions):
-    cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
-    cdef int c_num_partitions = num_partitions
-    cdef table_view c_source_view = table_view_from_columns(source_columns)
-
-    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
-    with nogil:
-        c_result = move(
-            cpp_hash_partition(
-                c_source_view,
-                c_columns_to_hash,
-                c_num_partitions
-            )
-        )
-
-    return (
-        columns_from_unique_ptr(move(c_result.first)),
-        list(c_result.second)
+    plc_table, offsets = plc.partitioning.hash_partition(
+        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
+        columns_to_hash,
+        num_partitions
     )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
index d94f0e1b564..13997da8403 100644
--- a/python/cudf/cudf/_lib/partitioning.pyx
+++ b/python/cudf/cudf/_lib/partitioning.pyx
@@ -2,24 +2,13 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.partitioning cimport partition as cpp_partition
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
+
+import pylibcudf as plc
 
 from cudf._lib.reduce import minmax
 from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
 
-cimport pylibcudf.libcudf.types as libcudf_types
-
 
 @acquire_spill_lock()
 def partition(list source_columns, Column partition_map,
@@ -50,25 +39,15 @@ def partition(list source_columns, Column partition_map,
 
     if num_partitions is None:
         num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True)
-    cdef int c_num_partitions = num_partitions
-    cdef table_view c_source_view = table_view_from_columns(source_columns)
-
-    cdef column_view c_partition_map_view = partition_map.view()
 
-    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
     if partition_map.size > 0:
         lo, hi = minmax(partition_map)
         if lo < 0 or hi >= num_partitions:
             raise ValueError("Partition map has invalid values")
-    with nogil:
-        c_result = move(
-            cpp_partition(
-                c_source_view,
-                c_partition_map_view,
-                c_num_partitions
-            )
-        )
 
-    return (
-        columns_from_unique_ptr(move(c_result.first)), list(c_result.second)
+    plc_table, offsets = plc.partitioning.partition(
+        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
+        partition_map.to_pylibcudf(mode="read"),
+        num_partitions
     )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index fb3a6c13a70..a7cb66d7b16 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -31,6 +31,7 @@ set(cython_sources
     lists.pyx
     merge.pyx
     null_mask.pyx
+    partitioning.pyx
     quantiles.pyx
     reduce.pyx
     replace.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index 66d9c3d6165..a384edd456d 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -17,6 +17,7 @@ from . cimport (
     lists,
     merge,
     null_mask,
+    partitioning,
     quantiles,
     reduce,
     replace,
@@ -61,6 +62,7 @@ __all__ = [
     "lists",
     "merge",
     "null_mask",
+    "partitioning",
     "quantiles",
     "reduce",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index 0a3615fa941..2a5365e8fad 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -28,6 +28,7 @@
     lists,
     merge,
     null_mask,
+    partitioning,
     quantiles,
     reduce,
     replace,
@@ -75,6 +76,7 @@
     "lists",
     "merge",
     "null_mask",
+    "partitioning",
     "quantiles",
     "reduce",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
index 1ea10e8a194..89bddbffab5 100644
--- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
@@ -25,3 +25,10 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil:
         const column_view& partition_map,
         int num_partitions
     ) except +
+
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \
+        round_robin_partition "cudf::round_robin_partition" (
+        const table_view& input,
+        int num_partitions,
+        int start_partition
+    ) except +
diff --git a/python/pylibcudf/pylibcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/partitioning.pxd
new file mode 100644
index 00000000000..aad60149fc4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/partitioning.pxd
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef tuple[Table, list] hash_partition(
+    Table input,
+    list columns_to_hash,
+    int num_partitions
+)
+
+cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions)
+
+cpdef tuple[Table, list] round_robin_partition(
+    Table input,
+    int num_partitions,
+    int start_partition=*
+)
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx
new file mode 100644
index 00000000000..8fa70daab5a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/partitioning.pyx
@@ -0,0 +1,120 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+cimport pylibcudf.libcudf.types as libcudf_types
+from libcpp.memory cimport unique_ptr
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+from pylibcudf.libcudf cimport partitioning as cpp_partitioning
+from pylibcudf.libcudf.table.table cimport table
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef tuple[Table, list] hash_partition(
+    Table input,
+    list columns_to_hash,
+    int num_partitions
+):
+    """
+    Partitions rows from the input table into multiple output tables.
+
+    For details, see :cpp:func:`hash_partition`.
+
+    Parameters
+    ----------
+    input : Table
+        The table to partition
+    columns_to_hash : list[int]
+        Indices of input columns to hash
+    num_partitions : int
+        The number of partitions to use
+
+    Returns
+    -------
+    tuple[Table, list[int]]
+        An output table and a vector of row offsets to each partition
+    """
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
+    cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
+    cdef int c_num_partitions = num_partitions
+
+    with nogil:
+        c_result = move(
+            cpp_partitioning.hash_partition(
+                input.view(), c_columns_to_hash, c_num_partitions
+            )
+        )
+
+    return Table.from_libcudf(move(c_result.first)), list(c_result.second)
+
+cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions):
+    """
+    Partitions rows of `t` according to the mapping specified by `partition_map`.
+
+    For details, see :cpp:func:`partition`.
+
+    Parameters
+    ----------
+    t : Table
+        The table to partition
+    partition_map : Column
+        Non-nullable column of integer values that map each row
+        in `t` to its partition.
+    num_partitions : int
+        The total number of partitions
+
+    Returns
+    -------
+    tuple[Table, list[int]]
+        An output table and a list of row offsets to each partition
+    """
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
+    cdef int c_num_partitions = num_partitions
+
+    with nogil:
+        c_result = move(
+            cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions)
+        )
+
+    return Table.from_libcudf(move(c_result.first)), list(c_result.second)
+
+
+cpdef tuple[Table, list] round_robin_partition(
+    Table input,
+    int num_partitions,
+    int start_partition=0
+):
+    """
+    Round-robin partition.
+
+    For details, see :cpp:func:`round_robin_partition`.
+
+    Parameters
+    ----------
+    input : Table
+        The input table to be round-robin partitioned
+    num_partitions : int
+        Number of partitions for the table
+    start_partition : int, default 0
+        Index of the 1st partition
+
+    Returns
+    -------
+    tuple[Table, list[int]]
+        The partitioned table and the partition offsets
+        for each partition within the table.
+    """
+    cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
+    cdef int c_num_partitions = num_partitions
+    cdef int c_start_partition = start_partition
+
+    with nogil:
+        c_result = move(
+            cpp_partitioning.round_robin_partition(
+                input.view(), c_num_partitions, c_start_partition
+            )
+        )
+
+    return Table.from_libcudf(move(c_result.first)), list(c_result.second)
diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
new file mode 100644
index 00000000000..444d0089d2c
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_table_eq
+
+
+@pytest.fixture(scope="module")
+def partitioning_data():
+    data = {"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]}
+    pa_table = pa.table(data)
+    plc_table = plc.interop.from_arrow(pa_table)
+    return data, plc_table, pa_table
+
+
+def test_partition(partitioning_data):
+    raw_data, plc_table, pa_table = partitioning_data
+    result, result_offsets = plc.partitioning.partition(
+        plc_table,
+        plc.interop.from_arrow(pa.array([0, 0, 0])),
+        1,
+    )
+    expected = pa.table(
+        list(raw_data.values()),
+        schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3),
+    )
+    assert_table_eq(expected, result)
+    assert result_offsets == [0, 3]
+
+
+def test_hash_partition(partitioning_data):
+    raw_data, plc_table, pa_table = partitioning_data
+    result, result_offsets = plc.partitioning.hash_partition(
+        plc_table, [0, 1], 1
+    )
+    expected = pa.table(
+        list(raw_data.values()),
+        schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3),
+    )
+    assert_table_eq(expected, result)
+    assert result_offsets == [0]
+
+
+def test_round_robin_partition(partitioning_data):
+    raw_data, plc_table, pa_table = partitioning_data
+    result, result_offsets = plc.partitioning.round_robin_partition(
+        plc_table, 1, 0
+    )
+    expected = pa.table(
+        list(raw_data.values()),
+        schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3),
+    )
+    assert_table_eq(expected, result)
+    assert result_offsets == [0]

From b1e1c9c060cc6b4b35b8590209177584336444bc Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 26 Sep 2024 11:00:00 -0700
Subject: [PATCH 09/11] Reapply `mixed_semi_join` refactoring and bug fixes
 (#16859)

This PR reapplies changes from #16230 and adds bug fixes and performance improvements for mixed_semi_join.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16859
---
 cpp/src/join/join_common_utils.hpp       |   6 -
 cpp/src/join/mixed_join_common_utils.cuh |  34 ++++++
 cpp/src/join/mixed_join_kernels_semi.cu  |  51 ++++----
 cpp/src/join/mixed_join_kernels_semi.cuh |   6 +-
 cpp/src/join/mixed_join_semi.cu          |  92 +++++---------
 cpp/tests/join/mixed_join_tests.cu       | 147 +++++++++++++++++++++++
 6 files changed, 239 insertions(+), 97 deletions(-)

diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 86402a0e7de..573101cefd9 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -22,7 +22,6 @@
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 
-#include <cuco/static_map.cuh>
 #include <cuco/static_multimap.cuh>
 #include <cuda/atomic>
 
@@ -51,11 +50,6 @@ using mixed_multimap_type =
                         cudf::detail::cuco_allocator<char>,
                         cuco::legacy::double_hashing<1, hash_type, hash_type>>;
 
-using semi_map_type = cuco::legacy::static_map<hash_value_type,
-                                               size_type,
-                                               cuda::thread_scope_device,
-                                               cudf::detail::cuco_allocator<char>>;
-
 using row_hash_legacy =
   cudf::row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>;
 
diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh
index 19701816867..4a52cfe098a 100644
--- a/cpp/src/join/mixed_join_common_utils.cuh
+++ b/cpp/src/join/mixed_join_common_utils.cuh
@@ -25,6 +25,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <cub/cub.cuh>
+#include <cuco/static_set.cuh>
 
 namespace cudf {
 namespace detail {
@@ -160,6 +161,39 @@ struct pair_expression_equality : public expression_equality<has_nulls> {
   }
 };
 
+/**
+ * @brief Equality comparator that composes two row_equality comparators.
+ */
+struct double_row_equality_comparator {
+  row_equality const equality_comparator;
+  row_equality const conditional_comparator;
+
+  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
+  {
+    using experimental::row::lhs_index_type;
+    using experimental::row::rhs_index_type;
+
+    return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
+           conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
+  }
+};
+
+// A CUDA Cooperative Group of 1 thread for the hash set for mixed semi.
+auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1;
+
+// The hash set type used by mixed_semi_join with the build_table.
+using hash_set_type =
+  cuco::static_set<size_type,
+                   cuco::extent<size_t>,
+                   cuda::thread_scope_device,
+                   double_row_equality_comparator,
+                   cuco::linear_probing<DEFAULT_MIXED_SEMI_JOIN_CG_SIZE, row_hash>,
+                   cudf::detail::cuco_allocator<char>,
+                   cuco::storage<1>>;
+
+// The hash_set_ref_type used by mixed_semi_join kernels for probing.
+using hash_set_ref_type = hash_set_type::ref_type<cuco::contains_tag>;
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index 7459ac3e99c..bd8c80652a0 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -38,38 +38,48 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                   table_device_view right_table,
                   table_device_view probe,
                   table_device_view build,
-                  row_hash const hash_probe,
                   row_equality const equality_probe,
-                  cudf::detail::semi_map_type::device_view hash_table_view,
+                  hash_set_ref_type set_ref,
                   cudf::device_span<bool> left_table_keep_mask,
                   cudf::ast::detail::expression_device_view device_expression_data)
 {
+  auto constexpr cg_size = hash_set_ref_type::cg_size;
+
+  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
   // used to circumvent conflicts between arrays of different types between
   // different template instantiations due to the extern specifier.
   extern __shared__ char raw_intermediate_storage[];
-  cudf::ast::detail::IntermediateDataType<has_nulls>* intermediate_storage =
+  auto intermediate_storage =
     reinterpret_cast<cudf::ast::detail::IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
   auto thread_intermediate_storage =
-    &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
+    intermediate_storage + (tile.meta_group_rank() * device_expression_data.num_intermediates);
 
-  cudf::size_type const left_num_rows  = left_table.num_rows();
-  cudf::size_type const right_num_rows = right_table.num_rows();
-  auto const outer_num_rows            = left_num_rows;
+  // Equality evaluator to use
+  auto const evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
+    left_table, right_table, device_expression_data);
 
-  cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size;
+  // Make sure to swap_tables here as hash_set will use probe table as the left one
+  auto constexpr swap_tables = true;
+  auto const equality        = single_expression_equality<has_nulls>{
+    evaluator, thread_intermediate_storage, swap_tables, equality_probe};
 
-  auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
-    left_table, right_table, device_expression_data);
+  // Create set ref with the new equality comparator
+  auto const set_ref_equality = set_ref.with_key_eq(equality);
 
-  if (outer_row_index < outer_num_rows) {
-    // Figure out the number of elements for this key.
-    auto equality = single_expression_equality<has_nulls>{
-      evaluator, thread_intermediate_storage, false, equality_probe};
+  // Total number of rows to query the set
+  auto const outer_num_rows = left_table.num_rows();
+  // Grid stride for the tile
+  auto const cg_grid_stride = cudf::detail::grid_1d::grid_stride<block_size>() / cg_size;
 
-    left_table_keep_mask[outer_row_index] =
-      hash_table_view.contains(outer_row_index, hash_probe, equality);
+  // Find all the rows in the left table that are in the hash table
+  for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>() / cg_size;
+       outer_row_index < outer_num_rows;
+       outer_row_index += cg_grid_stride) {
+    auto const result = set_ref_equality.contains(tile, outer_row_index);
+    if (tile.thread_rank() == 0) { left_table_keep_mask[outer_row_index] = result; }
   }
 }
 
@@ -78,9 +88,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
@@ -94,9 +103,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   } else {
@@ -106,9 +114,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   }
diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh
index 43714ffb36a..b08298e64e4 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cuh
+++ b/cpp/src/join/mixed_join_kernels_semi.cuh
@@ -45,9 +45,8 @@ namespace detail {
  * @param[in] right_table The right table
  * @param[in] probe The table with which to probe the hash table for matches.
  * @param[in] build The table with which the hash table was built.
- * @param[in] hash_probe The hasher used for the probe table.
  * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] hash_table_view The hash table built from `build`.
+ * @param[in] set_ref The hash table device view built from `build`.
  * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating
  * the corresponding index from left table is present in output
  * @param[in] device_expression_data Container of device data required to evaluate the desired
@@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index aa4fa281159..83a55eca50f 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -45,45 +45,6 @@
 namespace cudf {
 namespace detail {
 
-namespace {
-/**
- * @brief Device functor to create a pair of hash value and index for a given row.
- */
-struct make_pair_function_semi {
-  __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept
-  {
-    // The value is irrelevant since we only ever use the hash map to check for
-    // membership of a particular row index.
-    return cuco::make_pair(static_cast<hash_value_type>(i), 0);
-  }
-};
-
-/**
- * @brief Equality comparator that composes two row_equality comparators.
- */
-class double_row_equality {
- public:
-  double_row_equality(row_equality equality_comparator, row_equality conditional_comparator)
-    : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator}
-  {
-  }
-
-  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
-  {
-    using experimental::row::lhs_index_type;
-    using experimental::row::rhs_index_type;
-
-    return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
-           _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
-  }
-
- private:
-  row_equality _equality_comparator;
-  row_equality _conditional_comparator;
-};
-
-}  // namespace
-
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   table_view const& left_equality,
   table_view const& right_equality,
@@ -95,7 +56,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) &&
+  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and
                  (join_type != join_kind::FULL_JOIN),
                "Inner, left, and full joins should use mixed_join.");
 
@@ -136,7 +97,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   // output column and follow the null-supporting expression evaluation code
   // path.
   auto const has_nulls = cudf::nullate::DYNAMIC{
-    cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) ||
+    cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or
     binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)};
 
   auto const parser = ast::detail::expression_parser{
@@ -155,27 +116,20 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto right_conditional_view = table_device_view::create(right_conditional, stream);
 
   auto const preprocessed_build =
-    experimental::row::equality::preprocessed_table::create(build, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(build, stream);
   auto const preprocessed_probe =
-    experimental::row::equality::preprocessed_table::create(probe, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(probe, stream);
   auto const row_comparator =
-    cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build};
+    cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe};
   auto const equality_probe = row_comparator.equal_to<false>(has_nulls, compare_nulls);
 
-  semi_map_type hash_table{
-    compute_hash_table_size(build.num_rows()),
-    cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-    cuco::empty_value{cudf::detail::JoinNoneValue},
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
   // Create hash table containing all keys found in right table
   // TODO: To add support for nested columns we will need to flatten in many
   // places. However, this probably isn't worth adding any time soon since we
   // won't be able to support AST conditions for those types anyway.
   auto const build_nulls    = cudf::nullate::DYNAMIC{cudf::has_nulls(build)};
   auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build};
-  auto const hash_build     = row_hash_build.device_hasher(build_nulls);
+
   // Since we may see multiple rows that are identical in the equality tables
   // but differ in the conditional tables, the equality comparator used for
   // insertion must account for both sets of tables. An alternative solution
@@ -190,20 +144,28 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto const equality_build_equality =
     row_comparator_build.equal_to<false>(build_nulls, compare_nulls);
   auto const preprocessed_build_condtional =
-    experimental::row::equality::preprocessed_table::create(right_conditional, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream);
   auto const row_comparator_conditional_build =
     cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional,
                                                             preprocessed_build_condtional};
   auto const equality_build_conditional =
     row_comparator_conditional_build.equal_to<false>(build_nulls, compare_nulls);
-  double_row_equality equality_build{equality_build_equality, equality_build_conditional};
-  make_pair_function_semi pair_func_build{};
 
-  auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build);
+  hash_set_type row_set{
+    {compute_hash_table_size(build.num_rows())},
+    cuco::empty_key{JoinNoneValue},
+    {equality_build_equality, equality_build_conditional},
+    {row_hash_build.device_hasher(build_nulls)},
+    {},
+    {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    {stream.value()}};
+
+  auto iter = thrust::make_counting_iterator(0);
 
   // skip rows that are null here.
   if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) {
-    hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value());
+    row_set.insert_async(iter, iter + right_num_rows, stream.value());
   } else {
     thrust::counting_iterator<cudf::size_type> stencil(0);
     auto const [row_bitmask, _] =
@@ -211,18 +173,19 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
     row_is_valid pred{static_cast<bitmask_type const*>(row_bitmask.data())};
 
     // insert valid rows
-    hash_table.insert_if(
-      iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value());
+    row_set.insert_if_async(iter, iter + right_num_rows, stencil, pred, stream.value());
   }
 
-  auto hash_table_view = hash_table.get_device_view();
-
-  detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
-  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
+  detail::grid_1d const config(outer_num_rows * hash_set_type::cg_size, DEFAULT_JOIN_BLOCK_SIZE);
+  auto const shmem_size_per_block =
+    parser.shmem_per_thread *
+    cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size);
 
   auto const row_hash   = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
   auto const hash_probe = row_hash.device_hasher(has_nulls);
 
+  hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe);
+
   // Vector used to indicate indices from left/probe table which are present in output
   auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);
 
@@ -231,9 +194,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
                          *right_conditional_view,
                          *probe_view,
                          *build_view,
-                         hash_probe,
                          equality_probe,
-                         hash_table_view,
+                         row_set_ref,
                          cudf::device_span<bool>(left_table_keep_mask),
                          parser.device_expression_data,
                          config,
diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu
index 6c147c8a128..9041969bec7 100644
--- a/cpp/tests/join/mixed_join_tests.cu
+++ b/cpp/tests/join/mixed_join_tests.cu
@@ -778,6 +778,138 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality)
              {1});
 }
 
+TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap)
+{
+  auto const col_ref_left_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_one_greater_right_one =
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             left_one_greater_right_one,
+             {2, 7, 8});
+}
+
+TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge)
+{
+  using T1 = double;
+
+  // Number of rows in each column
+  auto constexpr N = 10000;
+
+  // Generate column data for left and right tables
+  auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns<T1>(N, 200);
+  auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns<T1>(N, 100);
+
+  // Setup data and nulls for the left table
+  std::vector<std::pair<std::vector<T1>, std::vector<bool>>> lefts = {
+    {left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}};
+  std::vector<cudf::test::fixed_width_column_wrapper<T1>> left_wrappers;
+  std::vector<cudf::column_view> left_columns;
+  for (auto [data, valids] : lefts) {
+    left_wrappers.emplace_back(
+      cudf::test::fixed_width_column_wrapper<T1>(data.begin(), data.end(), valids.begin()));
+    left_columns.emplace_back(left_wrappers.back());
+  };
+
+  // Setup data and nulls for the right table (views must come from right_wrappers)
+  std::vector<std::pair<std::vector<T1>, std::vector<bool>>> rights = {
+    {right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}};
+  std::vector<cudf::test::fixed_width_column_wrapper<T1>> right_wrappers;
+  std::vector<cudf::column_view> right_columns;
+  for (auto [data, valids] : rights) {
+    right_wrappers.emplace_back(
+      cudf::test::fixed_width_column_wrapper<T1>(data.begin(), data.end(), valids.begin()));
+    right_columns.emplace_back(right_wrappers.back());
+  };
+
+  // Left and right table views.
+  auto const left_table  = cudf::table_view{left_columns};
+  auto const right_table = cudf::table_view{right_columns};
+
+  // Using the zeroth column for equality.
+  auto const left_equality  = left_table.select({0});
+  auto const right_equality = right_table.select({0});
+
+  // Column references for equality column.
+  auto const col_ref_left_0  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_zero_eq_right_zero =
+    cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
+
+  // Mixed semi join with zeroth column equality
+  {
+    // Expected left_semi_join result
+    auto const expected_mixed_semi_join =
+      cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero);
+
+    // Actual mixed_left_semi_join result
+    auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
+                                                            right_equality,
+                                                            left_table,
+                                                            right_table,
+                                                            left_zero_eq_right_zero,
+                                                            cudf::null_equality::UNEQUAL);
+
+    // Copy data back to host for comparisons
+    auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
+      cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
+    auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
+      cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());
+
+    // Sort the indices for 1-1 comparison
+    std::sort(expected_indices.begin(), expected_indices.end());
+    std::sort(result_indices.begin(), result_indices.end());
+
+    // Expected and actual vectors must match.
+    EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
+    EXPECT_TRUE(
+      std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
+  }
+
+  // Mixed semi join with zeroth column equality and first column GREATER conditional
+  {
+    // Column references for conditional column.
+    auto const col_ref_left_1  = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT);
+    auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
+    auto left_one_gt_right_one =
+      cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+    // Expected left_semi_join result
+    auto const expected_mixed_semi_join = cudf::conditional_left_semi_join(
+      left_table,
+      right_table,
+      cudf::ast::operation(
+        cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one));
+
+    // Actual left_semi_join result
+    auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
+                                                            right_equality,
+                                                            left_table,
+                                                            right_table,
+                                                            left_one_gt_right_one,
+                                                            cudf::null_equality::UNEQUAL);
+
+    // Copy data back to host for comparisons
+    auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
+      cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
+    auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
+      cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());
+
+    // Sort the indices for 1-1 comparison
+    std::sort(expected_indices.begin(), expected_indices.end());
+    std::sort(result_indices.begin(), result_indices.end());
+
+    // Expected and actual vectors must match.
+    EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
+    EXPECT_TRUE(
+      std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
+  }
+}
+
 TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)
 {
   this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}},
@@ -900,3 +1032,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality)
              left_zero_eq_right_zero,
              {0, 1, 3});
 }
+
+TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap)
+{
+  // Predicate: column 0 of the LEFT table reference is strictly greater than column 0 of RIGHT.
+  auto const lhs_col = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const rhs_col = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto lhs_gt_rhs = cudf::ast::operation(cudf::ast::ast_operator::GREATER, lhs_col, rhs_col);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             lhs_gt_rhs,
+             {0, 1, 3, 4, 5, 6, 9});
+}

From d69e4b6fbdff9ad402a37de7940d64ed16b7d329 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 26 Sep 2024 08:07:48 -1000
Subject: [PATCH 10/11] Respect groupby.nunique(dropna=False) (#16921)

closes https://github.com/rapidsai/cudf/issues/16861

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16921
---
 python/cudf/cudf/_lib/aggregation.pyx    |  7 +++++--
 python/cudf/cudf/core/groupby/groupby.py | 16 ++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   | 17 +++++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 7c91533cf93..3c96b90f0a1 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -78,8 +78,11 @@ class Aggregation:
         )
 
     @classmethod
-    def nunique(cls):
-        return cls(pylibcudf.aggregation.nunique(pylibcudf.types.NullPolicy.EXCLUDE))
+    def nunique(cls, dropna=True):
+        # Map the dropna flag onto a null policy: EXCLUDE when True, INCLUDE otherwise.
+        policies = pylibcudf.types.NullPolicy
+        null_policy = policies.EXCLUDE if dropna else policies.INCLUDE
+        return cls(pylibcudf.aggregation.nunique(null_policy))
 
     @classmethod
     def nth(cls, size):
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index cb8cd0cd28b..be05075a2cd 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2232,6 +2232,22 @@ def func(x):
 
         return self.agg(func)
 
+    @_performance_tracking
+    def nunique(self, dropna: bool = True):
+        """
+        Count the distinct values within each group.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't include null values in the counts.
+        """
+
+        def func(x):
+            return x.nunique(dropna=dropna)
+
+        return self.agg(func)
+
     @_performance_tracking
     def std(
         self,
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 848bc259e7b..14ba9894fd3 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1940,6 +1940,23 @@ def test_groupby_nunique(agg, by):
     assert_groupby_results_equal(expect, got, check_dtype=False)
 
 
+@pytest.mark.parametrize("dropna", [True, False])
+def test_nunique_dropna(dropna):
+    # Group a == 1 holds a null in "b", so dropna changes its distinct count.
+    gdf = cudf.DataFrame(
+        {
+            "a": [1, 1, 2],
+            "b": [4, None, 5],
+            "c": [None, None, 7],
+            "d": [1, 1, 3],
+        }
+    )
+    pdf = gdf.to_pandas()
+    result = gdf.groupby("a")["b"].nunique(dropna=dropna)
+    expected = pdf.groupby("a")["b"].nunique(dropna=dropna)
+    assert_groupby_results_equal(result, expected, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     "n",
     [0, 1, 2, 10],

From 742eaadb92b0c5159d92be49e647a17e8c1d0b9b Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 26 Sep 2024 14:27:37 -0500
Subject: [PATCH 11/11] Fix links in Dask cuDF documentation (#16929)

More follow-up fixes to the recent Dask-cuDF documentation additions.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16929
---
 docs/dask_cudf/source/best_practices.rst | 15 +++++++++------
 docs/dask_cudf/source/conf.py            |  1 +
 docs/dask_cudf/source/index.rst          | 11 +++++------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst
index 83039f86fed..41263ebf589 100644
--- a/docs/dask_cudf/source/best_practices.rst
+++ b/docs/dask_cudf/source/best_practices.rst
@@ -81,7 +81,7 @@ representations, native cuDF spilling may be insufficient. For these cases,
 `JIT-unspill <https://docs.rapids.ai/api/dask-cuda/nightly/spilling/#jit-unspill>`__
 is likely to produce better protection from out-of-memory (OOM) errors.
 Please see `Dask-CUDA's spilling documentation
-<https://docs.rapids.ai/api/dask-cuda/24.10/spilling/>`__ for further details
+<https://docs.rapids.ai/api/dask-cuda/stable/spilling/>`__ for further details
 and guidance.
 
 Use RMM
@@ -160,7 +160,7 @@ of the underlying task graph to materialize the collection.
 
 :func:`sort_values` / :func:`set_index` : These operations both require Dask to
 eagerly collect quantile information about the column(s) being targeted by the
-global sort operation. See `Avoid Sorting`__ for notes on sorting considerations.
+global sort operation. See the next section for notes on sorting considerations.
 
 .. note::
   When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the
@@ -297,11 +297,14 @@ bottleneck is typically device-to-host memory spilling.
 Although every workflow is different, the following guidelines
 are often recommended:
 
-* `Use a distributed cluster with Dask-CUDA workers <Use Dask-CUDA>`_
-* `Use native cuDF spilling whenever possible <Enable cuDF Spilling>`_
+* Use a distributed cluster with `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__ workers
+
+* Use native cuDF spilling whenever possible (`Dask-CUDA spilling documentation <https://docs.rapids.ai/api/dask-cuda/stable/spilling/>`__)
+
 * Avoid shuffling whenever possible
-  * Use ``split_out=1`` for low-cardinality groupby aggregations
-  * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``)
+    * Use ``split_out=1`` for low-cardinality groupby aggregations
+    * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``)
+
 * `Use UCX <https://docs.rapids.ai/api/dask-cuda/nightly/examples/ucx/>`__ if communication is a bottleneck.
 
 .. note::
diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py
index dc40254312e..5daa8245695 100644
--- a/docs/dask_cudf/source/conf.py
+++ b/docs/dask_cudf/source/conf.py
@@ -78,6 +78,7 @@
     "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None),
     "dask": ("https://docs.dask.org/en/stable/", None),
     "pandas": ("https://pandas.pydata.org/docs/", None),
+    "dask-cuda": ("https://docs.rapids.ai/api/dask-cuda/stable/", None),
 }
 
 numpydoc_show_inherited_class_members = True
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 6eb755d7854..c2891ebc15e 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -16,10 +16,9 @@ as the ``"cudf"`` dataframe backend for
   Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU
   or multi-node execution on their own. You must also deploy a
   `dask.distributed <https://distributed.dask.org/en/stable/>`__ cluster
-  to leverage multiple GPUs. We strongly recommend using `Dask-CUDA
-  <https://docs.rapids.ai/api/dask-cuda/stable/>`__ to simplify the
-  setup of the cluster, taking advantage of all features of the GPU
-  and networking hardware.
+  to leverage multiple GPUs. We strongly recommend using :doc:`Dask-CUDA <dask-cuda:index>`
+  to simplify the setup of the cluster, taking advantage of all features
+  of the GPU and networking hardware.
 
 If you are familiar with Dask and `pandas <pandas.pydata.org>`__ or
 `cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask cuDF
@@ -161,7 +160,7 @@ out-of-core computing. This also means that the compute tasks can be
 executed in parallel over a multi-GPU cluster.
 
 In order to execute your Dask workflow on multiple GPUs, you will
-typically need to use `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+typically need to use :doc:`Dask-CUDA <dask-cuda:index>`
 to deploy distributed Dask cluster, and
 `Distributed <https://distributed.dask.org/en/stable/client.html>`__
 to define a client object. For example::
@@ -192,7 +191,7 @@ to define a client object. For example::
   <https://distributed.dask.org/en/stable/manage-computation.html>`__
   for more details.
 
-Please see the `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+Please see the :doc:`Dask-CUDA <dask-cuda:index>`
 documentation for more information about deploying GPU-aware clusters
 (including `best practices
 <https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/>`__).