From c81198789be183e7e1eb288eb98dd16f65b57e44 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 17 Jan 2024 13:17:35 -0600
Subject: [PATCH 01/60] Defer PTX file load to runtime (#13690)

This PR fixes an issue where cuDF fails to import on machines with no
NVIDIA GPU present.

cc @shwina

Authors:
  - https://github.com/brandon-b-miller
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/13690
---
 python/cudf/cudf/core/udf/utils.py         | 16 +++++++++++-----
 python/cudf/cudf/tests/test_no_device.py   | 16 ++++++++++++++++
 python/cudf/cudf/tests/test_string_udfs.py |  6 ++++--
 3 files changed, 31 insertions(+), 7 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_no_device.py

diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index bd57db6b620..12baf1ea6d1 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+import functools
 import os
 from typing import Any, Callable, Dict
 
@@ -60,10 +61,15 @@
 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
 launch_arg_getters: Dict[Any, Any] = {}
 
-_PTX_FILE = _get_ptx_file(
-    os.path.join(os.path.dirname(strings_udf.__file__), "..", "core", "udf"),
-    "shim_",
-)
+
+@functools.cache
+def _ptx_file():
+    return _get_ptx_file(
+        os.path.join(
+            os.path.dirname(strings_udf.__file__), "..", "core", "udf"
+        ),
+        "shim_",
+    )
 
 
 @_cudf_nvtx_annotate
@@ -286,7 +292,7 @@ def _get_kernel(kernel_string, globals_, sig, func):
     exec(kernel_string, globals_)
     _kernel = globals_["_kernel"]
     kernel = cuda.jit(
-        sig, link=[_PTX_FILE], extensions=[str_view_arg_handler]
+        sig, link=[_ptx_file()], extensions=[str_view_arg_handler]
     )(_kernel)
 
     return kernel
diff --git a/python/cudf/cudf/tests/test_no_device.py b/python/cudf/cudf/tests/test_no_device.py
new file mode 100644
index 00000000000..722762b2d0c
--- /dev/null
+++ b/python/cudf/cudf/tests/test_no_device.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import os
+import subprocess
+
+
+def test_cudf_import_no_device():
+    env = os.environ.copy()
+    env["CUDA_VISIBLE_DEVICES"] = "-1"
+    output = subprocess.run(
+        ["python", "-c", "import cudf"],
+        env=env,
+        capture_output=True,
+        text=True,
+        cwd="/",
+    )
+    assert output.returncode == 0
diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py
index 88c73ccf964..5dbb86fe27d 100644
--- a/python/cudf/cudf/tests/test_string_udfs.py
+++ b/python/cudf/cudf/tests/test_string_udfs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 import numba
 import numpy as np
@@ -20,10 +20,12 @@
     string_view,
     udf_string,
 )
-from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size
+from cudf.core.udf.utils import _get_extensionty_size, _ptx_file
 from cudf.testing._utils import assert_eq, sv_to_udf_str
 from cudf.utils._numba import _CUDFNumbaConfig
 
+_PTX_FILE = _ptx_file()
+
 
 def get_kernels(func, dtype, size):
     """

From 42e946f4056bed7942d7a355aeb23317506305e5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 17 Jan 2024 13:00:14 -0800
Subject: [PATCH 02/60] Reorganize ORC reader into multiple files and perform
 some small fixes to cuIO code (#14665)

This refactors the ORC reader, moving ORC code around to facilitate the
upcoming support for chunked reading of the input files. No new
functionality/implementation is added in this PR. Only the existing code
is moved around, except that some small issues in the related ORC/cuIO
code are also fixed.

Authors:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14665
---
 cpp/CMakeLists.txt                         |    2 +
 cpp/src/io/orc/aggregate_orc_metadata.cpp  |   45 +-
 cpp/src/io/orc/aggregate_orc_metadata.hpp  |   38 +-
 cpp/src/io/orc/reader_impl.cu              | 1289 +-------------------
 cpp/src/io/orc/reader_impl.hpp             |   51 +-
 cpp/src/io/orc/reader_impl_chunking.hpp    |   42 +
 cpp/src/io/orc/reader_impl_helpers.cpp     |  157 +++
 cpp/src/io/orc/reader_impl_helpers.hpp     |  153 +++
 cpp/src/io/orc/reader_impl_preprocess.cu   | 1048 ++++++++++++++++
 cpp/src/io/utilities/hostdevice_vector.hpp |    6 +-
 10 files changed, 1514 insertions(+), 1317 deletions(-)
 create mode 100644 cpp/src/io/orc/reader_impl_chunking.hpp
 create mode 100644 cpp/src/io/orc/reader_impl_helpers.cpp
 create mode 100644 cpp/src/io/orc/reader_impl_helpers.hpp
 create mode 100644 cpp/src/io/orc/reader_impl_preprocess.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cb1fdb1f557..2c0f601ca74 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -380,6 +380,8 @@ add_library(
   src/io/orc/dict_enc.cu
   src/io/orc/orc.cpp
   src/io/orc/reader_impl.cu
+  src/io/orc/reader_impl_helpers.cpp
+  src/io/orc/reader_impl_preprocess.cu
   src/io/orc/stats_enc.cu
   src/io/orc/stripe_data.cu
   src/io/orc/stripe_enc.cu
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index 2e5eeab7298..8cae1ff5309 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@ #include #include -#include namespace cudf::io::orc::detail { @@ -220,27 +219,25 @@ aggregate_orc_metadata::select_stripes( } // Read each stripe's stripefooter metadata - if (not selected_stripes_mapping.empty()) { - for (auto& mapping : selected_stripes_mapping) { - // Resize to all stripe_info for the source level - per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); - - for (size_t i = 0; i < mapping.stripe_info.size(); i++) { - auto const stripe = mapping.stripe_info[i].first; - auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; - auto const sf_comp_length = stripe->footerLength; - CUDF_EXPECTS( - sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(), - "Invalid stripe information"); - auto const buffer = - per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length); - auto sf_data = per_file_metadata[mapping.source_idx].decompressor->decompress_blocks( - {buffer->data(), buffer->size()}, stream); - ProtobufReader(sf_data.data(), sf_data.size()) - .read(per_file_metadata[mapping.source_idx].stripefooters[i]); - mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; - if (stripe->indexLength == 0) { row_grp_idx_present = false; } - } + for (auto& mapping : selected_stripes_mapping) { + // Resize to all stripe_info for the source level + per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); + + for (size_t i = 0; i < mapping.stripe_info.size(); i++) { + auto const stripe = mapping.stripe_info[i].first; + auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; + auto const sf_comp_length = stripe->footerLength; + CUDF_EXPECTS( + sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(), + "Invalid stripe information"); + auto const buffer = + per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length); + auto sf_data = per_file_metadata[mapping.source_idx].decompressor->decompress_blocks( + {buffer->data(), buffer->size()}, stream); + ProtobufReader(sf_data.data(), sf_data.size()) + .read(per_file_metadata[mapping.source_idx].stripefooters[i]); + mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; + if (stripe->indexLength == 0) { row_grp_idx_present = false; } } } @@ -270,7 +267,7 @@ column_hierarchy aggregate_orc_metadata::select_columns( CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(path)); } } - return {std::move(selected_columns)}; + return column_hierarchy{std::move(selected_columns)}; } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 587684ccc0d..f05946a4346 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + #include "orc.hpp" #include @@ -33,8 +35,8 @@ struct column_hierarchy { // Each element contains column at the given nesting level std::vector> levels; - column_hierarchy(nesting_map child_map); - auto num_levels() const { return levels.size(); } + explicit column_hierarchy(nesting_map child_map); + [[nodiscard]] auto num_levels() const { return levels.size(); } }; /** @@ -50,11 +52,6 @@ class aggregate_orc_metadata { */ [[nodiscard]] int64_t calc_num_rows() const; - /** - * @brief Number of columns in a ORC file. - */ - [[nodiscard]] size_type calc_num_cols() const; - /** * @brief Sums up the number of stripes of each source */ @@ -69,22 +66,23 @@ class aggregate_orc_metadata { aggregate_orc_metadata(std::vector> const& sources, rmm::cuda_stream_view stream); - [[nodiscard]] auto const& get_schema(int schema_idx) const + [[nodiscard]] auto get_col_type(int col_idx) const { - return per_file_metadata[0].ff.types[schema_idx]; + return per_file_metadata[0].ff.types[col_idx]; } - auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; } - [[nodiscard]] auto get_num_rows() const { return num_rows; } - auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } + [[nodiscard]] auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } [[nodiscard]] auto get_num_stripes() const { return num_stripes; } [[nodiscard]] auto const& get_types() const { return per_file_metadata[0].ff.types; } - [[nodiscard]] int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } + [[nodiscard]] int get_row_index_stride() const + { + return static_cast(per_file_metadata[0].ff.rowIndexStride); + } [[nodiscard]] auto is_row_grp_idx_present() const { return row_grp_idx_present; } @@ -115,11 +113,11 @@ class aggregate_orc_metadata { * * Stripes are potentially selected from multiple files. */ - std::tuple> select_stripes( - std::vector> const& user_specified_stripes, - uint64_t skip_rows, - std::optional const& num_rows, - rmm::cuda_stream_view stream); + [[nodiscard]] std::tuple> + select_stripes(std::vector> const& user_specified_stripes, + uint64_t skip_rows, + std::optional const& num_rows, + rmm::cuda_stream_view stream); /** * @brief Filters ORC file to a selection of columns, based on their paths in the file. @@ -131,7 +129,7 @@ class aggregate_orc_metadata { * `nullopt` if user did not select columns to read * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level */ - column_hierarchy select_columns( + [[nodiscard]] column_hierarchy select_columns( std::optional> const& column_paths) const; }; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 46f6861e789..cf3121fe659 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -14,930 +14,11 @@ * limitations under the License. */ -/** - * @file reader_impl.cu - * @brief cuDF-IO ORC reader class implementation - */ - -#include "orc.hpp" -#include "orc_gpu.hpp" - #include "reader_impl.hpp" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include +#include "reader_impl_chunking.hpp" +#include "reader_impl_helpers.hpp" namespace cudf::io::orc::detail { -using namespace cudf::io::detail; - -namespace { - -/** - * @brief Keeps track of orc mapping and child column details. 
- */ -struct reader_column_meta { - // Mapping between column id in orc to processing order. - std::vector> orc_col_map; - - // Number of rows in child columns. - std::vector num_child_rows; - - // Consists of parent column valid_map and null count. - std::vector parent_column_data; - - std::vector parent_column_index; - - // Start row of child columns [stripe][column]. - std::vector child_start_row; - - // Number of rows of child columns [stripe][column]. - std::vector num_child_rows_per_stripe; - - struct row_group_meta { - uint32_t num_rows; // number of rows in a column in a row group - uint32_t start_row; // start row in a column in a row group - }; - - // Row group metadata [rowgroup][column]. - std::vector rwgrp_meta; -}; - -/** - * @brief Struct that maps ORC streams to columns - */ -struct orc_stream_info { - explicit orc_stream_info(uint64_t offset_, - std::size_t dst_pos_, - uint32_t length_, - uint32_t stripe_idx_) - : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_) - { - } - uint64_t offset; // offset in file - std::size_t dst_pos; // offset in memory relative to start of compressed stripe data - std::size_t length; // length in file - uint32_t stripe_idx; // stripe index -}; - -/** - * @brief Function that populates column descriptors stream/chunk - */ -std::size_t gather_stream_info(std::size_t stripe_index, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool use_index, - bool apply_struct_map, - std::size_t* num_dictionary_entries, - std::vector& stream_info, - cudf::detail::hostdevice_2dvector& chunks) -{ - uint64_t src_offset = 0; - uint64_t dst_offset = 0; - - auto const get_stream_index_type = [](orc::StreamKind kind) { - switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; - default: - // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; - } - }; - - for (auto const& stream : stripefooter->streams) { - if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { - dst_offset += stream.length; - continue; - } - - auto const column_id = *stream.column_id; - auto col = orc2gdf[column_id]; - - if (col == -1 and apply_struct_map) { - // A struct-type column has no data itself, but rather child columns - // for each of its fields. There is only a PRESENT stream, which - // needs to be included for the reader. - auto const schema_type = types[column_id]; - if (not schema_type.subtypes.empty()) { - if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - for (auto const& idx : schema_type.subtypes) { - auto child_idx = (idx < orc2gdf.size()) ? 
orc2gdf[idx] : -1; - if (child_idx >= 0) { - col = child_idx; - auto& chunk = chunks[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); - chunk.strm_len[gpu::CI_PRESENT] = stream.length; - } - } - } - } - } - if (col != -1) { - if (src_offset >= stripeinfo->indexLength || use_index) { - auto& chunk = chunks[stripe_index][col]; - auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { - chunk.strm_id[index_type] = stream_info.size(); - chunk.strm_len[index_type] = stream.length; - // NOTE: skip_count field is temporarily used to track the presence of index streams - chunk.skip_count |= 1 << index_type; - - if (index_type == gpu::CI_DICTIONARY) { - chunk.dictionary_start = *num_dictionary_entries; - chunk.dict_len = stripefooter->columns[column_id].dictionarySize; - *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; - } - } - } - stream_info.emplace_back( - stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); - dst_offset += stream.length; - } - src_offset += stream.length; - } - - return dst_offset; -} - -/** - * @brief Decompresses the stripe data, at stream granularity. - * - * @param decompressor Block decompressor - * @param stripe_data List of source stripe column data - * @param stream_info List of stream to column mappings - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param num_stripes Number of stripes making up column chunks - * @param row_index_stride Distance between each row index - * @param use_base_stride Whether to use base stride obtained from meta or use the computed value - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Device buffer to decompressed page data - */ -rmm::device_buffer decompress_stripe_data( - OrcDecompressor const& decompressor, - host_span stripe_data, - host_span stream_info, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::hostdevice_2dvector& row_groups, - std::size_t num_stripes, - std::size_t row_index_stride, - bool use_base_stride, - rmm::cuda_stream_view stream) -{ - // Parse the columns' compressed info - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), stream); - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - } - compinfo.host_to_device_async(stream); - - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); - compinfo.device_to_host_sync(stream); - - // Count the exact number of compressed blocks - std::size_t num_compressed_blocks = 0; - std::size_t num_uncompressed_blocks = 0; - std::size_t total_decomp_size = 0; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - num_compressed_blocks += compinfo[i].num_compressed_blocks; - num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; - total_decomp_size += compinfo[i].max_uncompressed_size; - } - CUDF_EXPECTS( - not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), - "Inconsistent info on compression blocks"); - - // Buffer needs to be padded. - // Required by `gpuDecodeOrcColumnData`. 
- rmm::device_buffer decomp_data( - cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - if (decomp_data.is_empty()) { return decomp_data; } - - rmm::device_uvector> inflate_in( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector> inflate_out( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_res(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), - inflate_res.begin(), - inflate_res.end(), - compression_result{0, compression_status::FAILURE}); - - // Parse again to populate the decompression input/output buffers - std::size_t decomp_offset = 0; - uint32_t max_uncomp_block_size = 0; - uint32_t start_pos = 0; - auto start_pos_uncomp = (uint32_t)num_compressed_blocks; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - auto dst_base = static_cast(decomp_data.data()); - compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; - compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; - compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; - compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; - compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; - - stream_info[i].dst_pos = decomp_offset; - decomp_offset += compinfo[i].max_uncompressed_size; - start_pos += compinfo[i].num_compressed_blocks; - start_pos_uncomp += compinfo[i].num_uncompressed_blocks; - max_uncomp_block_size = - std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); - } - compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); - - // Value for checking whether we decompress successfully. - // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. 
- cudf::detail::hostdevice_vector any_block_failure(1, stream); - any_block_failure[0] = false; - any_block_failure.host_to_device_async(stream); - - // Dispatch batches of blocks to decompress - if (num_compressed_blocks > 0) { - device_span> inflate_in_view{inflate_in.data(), - num_compressed_blocks}; - device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; - switch (decompressor.compression()) { - case compression_type::ZLIB: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { - gpuinflate( - inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::SNAPPY: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::ZSTD: - if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Decompression error: " + reason.value()); - } - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - - // Check if any block has been failed to decompress. - // Not using `thrust::any` or `thrust::count_if` to defer stream sync. - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(std::size_t{0}), - thrust::make_counting_iterator(inflate_res.size()), - [results = inflate_res.begin(), - any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) { - if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; } - }); - } - - if (num_uncompressed_blocks > 0) { - device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, - num_uncompressed_blocks}; - device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, - num_uncompressed_blocks}; - gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); - } - - // Copy without stream sync, thus need to wait for stream sync below to access. - any_block_failure.device_to_host_async(stream); - - gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); - compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. - - // We can check on host after stream synchronize - CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); - - auto const num_columns = chunks.size().second; - - // Update the stream information with the updated uncompressed info - // TBD: We could update the value from the information we already - // have in stream_info[], but using the gpu results also updates - // max_uncompressed_size to the actual uncompressed size, or zero if - // decompression failed. 
- for (std::size_t i = 0; i < num_stripes; ++i) { - for (std::size_t j = 0; j < num_columns; ++j) { - auto& chunk = chunks[i][j]; - for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { - if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { - chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; - chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; - } - } - } - } - - if (row_groups.size().first) { - chunks.host_to_device_async(stream); - row_groups.host_to_device_async(stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - compinfo.device_ptr(), - chunks.base_device_ptr(), - num_columns, - num_stripes, - row_groups.size().first, - row_index_stride, - use_base_stride, - stream); - } - - return decomp_data; -} - -/** - * @brief Updates null mask of columns whose parent is a struct column. - * - * If struct column has null element, that row would be skipped while writing child column in ORC, - * so we need to insert the missing null elements in child column. There is another behavior from - * pyspark, where if the child column doesn't have any null elements, it will not have present - * stream, so in that case parent null mask need to be copied to child column. - * - * @param chunks Vector of list of column chunk descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation - */ -void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, - host_span out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - bool is_mask_updated = false; - - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { - if (not is_mask_updated) { - chunks.device_to_host_sync(stream); - is_mask_updated = true; - } - - auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; - auto child_valid_map_base = out_buffers[col_idx].null_mask(); - auto child_mask_len = - chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; - auto parent_mask_len = chunks[0][col_idx].column_num_rows; - - if (child_valid_map_base != nullptr) { - rmm::device_uvector dst_idx(child_mask_len, stream); - // Copy indexes at which the parent has valid value. 
- thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + parent_mask_len, - dst_idx.begin(), - [parent_valid_map_base] __device__(auto idx) { - return bit_is_set(parent_valid_map_base, idx); - }); - - auto merged_null_mask = cudf::detail::create_null_mask( - parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - auto merged_mask = static_cast(merged_null_mask.data()); - uint32_t* dst_idx_ptr = dst_idx.data(); - // Copy child valid bits from child column to valid indexes, this will merge both child - // and parent null masks - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + dst_idx.size(), - [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { - if (bit_is_set(child_valid_map_base, idx)) { - cudf::set_bit(merged_mask, dst_idx_ptr[idx]); - }; - }); - - out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); - - } else { - // Since child column doesn't have a mask, copy parent null mask - auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); - out_buffers[col_idx].set_null_mask( - rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); - } - } - } - - if (is_mask_updated) { - // Update chunks with pointers to column data which might have been changed. - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - } - } - chunks.host_to_device_sync(stream); - } -} - -/** - * @brief Converts the stripe column data and outputs to columns. - * - * @param num_dicts Number of dictionary entries required - * @param skip_rows Number of rows to offset from start - * @param row_index_stride Distance between each row index - * @param level Current nesting level being processed - * @param tz_table Local time to UTC conversion table - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ -void decode_stream_data(std::size_t num_dicts, - std::size_t skip_rows, - std::size_t row_index_stride, - std::size_t level, - table_view const& tz_table, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::device_2dspan row_groups, - std::vector& out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - thrust::counting_iterator col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); - - // Update chunks with pointers to column data - std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { - std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.column_data_base = out_buffers[col_idx].data(); - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - }); - }); - - // Allocate global dictionary for deserializing - rmm::device_uvector global_dict(num_dicts, stream); - - chunks.host_to_device_sync(stream); - gpu::DecodeNullsAndStringDictionaries( - chunks.base_device_ptr(), global_dict.data(), num_columns, 
num_stripes, skip_rows, stream); - - if (level > 0) { - // Update nullmasks for children if parent was a struct and had null mask - update_null_mask(chunks, out_buffers, stream, mr); - } - - auto const tz_table_dptr = table_device_view::create(tz_table, stream); - rmm::device_scalar error_count(0, stream); - // Update the null map for child columns - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), - global_dict.data(), - row_groups, - num_columns, - num_stripes, - skip_rows, - *tz_table_dptr, - row_groups.size().first, - row_index_stride, - level, - error_count.data(), - stream); - chunks.device_to_host_async(stream); - // `value` synchronizes - auto const num_errors = error_count.value(stream); - CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); - - std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { - out_buffers[col_idx].null_count() = - std::accumulate(stripe_idx_it + 0, - stripe_idx_it + num_stripes, - 0, - [&](auto null_count, auto const stripe_idx) { - return null_count + chunks[stripe_idx][col_idx].null_count; - }); - }); -} - -/** - * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current - * layer. - */ -void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, - cudf::host_span> prefix_sums, - rmm::cuda_stream_view stream) -{ - auto const num_stripes = chunks.size().first; - if (num_stripes == 0) return; - - auto const num_columns = chunks.size().second; - std::vector>> prefix_sums_to_update; - for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { - // Null counts sums are only needed for children of struct columns - if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); - } - } - auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( - prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); - - thrust::for_each(rmm::exec_policy(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - - thrust::transform( - thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + psums.size(), - psums.begin(), - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - - thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); - }); - // `prefix_sums_to_update` goes out of scope, copy has to be done before we return - stream.synchronize(); -} - -/** - * @brief Aggregate child metadata from parent column chunks. - */ -void aggregate_child_meta(std::size_t level, - column_hierarchy const& selected_columns, - cudf::detail::host_2dspan chunks, - cudf::detail::host_2dspan row_groups, - host_span list_col, - host_span out_buffers, - reader_column_meta& col_meta) -{ - auto const num_of_stripes = chunks.size().first; - auto const num_of_rowgroups = row_groups.size().first; - auto const num_child_cols = selected_columns.levels[level + 1].size(); - auto const number_of_child_chunks = num_child_cols * num_of_stripes; - auto& num_child_rows = col_meta.num_child_rows; - auto& parent_column_data = col_meta.parent_column_data; - - // Reset the meta to store child column details. 
- num_child_rows.resize(selected_columns.levels[level + 1].size()); - std::fill(num_child_rows.begin(), num_child_rows.end(), 0); - parent_column_data.resize(number_of_child_chunks); - col_meta.parent_column_index.resize(number_of_child_chunks); - col_meta.child_start_row.resize(number_of_child_chunks); - col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); - col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); - auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); - - int index = 0; // number of child column processed - - // For each parent column, update its child column meta for each stripe. - std::for_each(list_col.begin(), list_col.end(), [&](auto const p_col) { - auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; - auto start_row = 0; - auto processed_row_groups = 0; - - for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { - // Aggregate num_rows and start_row from processed parent columns per row groups - if (num_of_rowgroups) { - auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; - auto processed_child_rows = 0; - - for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; - rowgroup_id++, processed_row_groups++) { - auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; - rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; - } - processed_child_rows += child_rows; - } - } - - // Aggregate start row, number of rows per chunk and total number of rows in a column - auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - - num_child_rows[child_col_idx] += child_rows; - num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; - // start row could be different for each column when there is nesting at each stripe level - child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; - } - start_row += child_rows; - } - - // Parent column null mask and null count would be required for child column - // to adjust its nullmask. 
- auto type = out_buffers[parent_col_idx].type.id(); - auto parent_null_count = static_cast(out_buffers[parent_col_idx].null_count()); - auto parent_valid_map = out_buffers[parent_col_idx].null_mask(); - auto num_rows = out_buffers[parent_col_idx].size; - - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - col_meta.parent_column_index[child_col_idx] = parent_col_idx; - if (type == type_id::STRUCT) { - parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count}; - // Number of rows in child will remain same as parent in case of struct column - num_child_rows[child_col_idx] = num_rows; - } else { - parent_column_data[child_col_idx] = {nullptr, 0}; - } - } - index += p_col.num_children; - }); -} - -/** - * @brief struct to store buffer data and size of list buffer - */ -struct list_buffer_data { - size_type* data; - size_type size; -}; - -// Generates offsets for list buffer from number of elements in a row. -void generate_offsets_for_list(host_span buff_data, rmm::cuda_stream_view stream) -{ - for (auto& list_data : buff_data) { - thrust::exclusive_scan(rmm::exec_policy_nosync(stream), - list_data.data, - list_data.data + list_data.size, - list_data.data); - } -} - -/** - * @brief Function that translates ORC data kind to cuDF type enum - */ -constexpr type_id to_cudf_type(orc::TypeKind kind, - bool use_np_dtypes, - type_id timestamp_type_id, - type_id decimal_type_id) -{ - switch (kind) { - case orc::BOOLEAN: return type_id::BOOL8; - case orc::BYTE: return type_id::INT8; - case orc::SHORT: return type_id::INT16; - case orc::INT: return type_id::INT32; - case orc::LONG: return type_id::INT64; - case orc::FLOAT: return type_id::FLOAT32; - case orc::DOUBLE: return type_id::FLOAT64; - case orc::STRING: - case orc::BINARY: - case orc::VARCHAR: - case orc::CHAR: - // Variable-length types can all be mapped to STRING - return type_id::STRING; - case orc::TIMESTAMP: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - case orc::DATE: - // There isn't a (DAYS -> np.dtype) mapping - return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; - case orc::DECIMAL: return decimal_type_id; - // Need to update once cuDF plans to support map type - case orc::MAP: - case orc::LIST: return type_id::LIST; - case orc::STRUCT: return type_id::STRUCT; - default: break; - } - - return type_id::EMPTY; -} - -/** - * @brief Determines cuDF type of an ORC Decimal column. - */ -type_id to_cudf_decimal_type(host_span decimal128_columns, - aggregate_orc_metadata const& metadata, - int column_index) -{ - if (metadata.get_col_type(column_index).kind != DECIMAL) { return type_id::EMPTY; } - - if (std::find(decimal128_columns.begin(), - decimal128_columns.end(), - metadata.column_path(0, column_index)) != decimal128_columns.end()) { - return type_id::DECIMAL128; - } - - auto const precision = metadata.get_col_type(column_index) - .precision.value_or(cuda::std::numeric_limits::digits10); - if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL32; } - if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL64; } - return type_id::DECIMAL128; -} - -std::string get_map_child_col_name(std::size_t const idx) { return (idx == 0) ? "key" : "value"; } - -/** - * @brief Create empty columns and respective schema information from the buffer. 
- */ -std::unique_ptr create_empty_column(size_type orc_col_id, - aggregate_orc_metadata const& metadata, - host_span decimal128_columns, - bool use_np_dtypes, - data_type timestamp_type, - column_name_info& schema_info, - rmm::cuda_stream_view stream) -{ - schema_info.name = metadata.column_name(0, orc_col_id); - auto const kind = metadata.get_col_type(orc_col_id).kind; - auto const type = to_cudf_type(kind, - use_np_dtypes, - timestamp_type.id(), - to_cudf_decimal_type(decimal128_columns, metadata, orc_col_id)); - - switch (kind) { - case orc::LIST: { - schema_info.children.emplace_back("offsets"); - schema_info.children.emplace_back(""); - return make_lists_column(0, - make_empty_column(type_id::INT32), - create_empty_column(metadata.get_col_type(orc_col_id).subtypes[0], - metadata, - decimal128_columns, - use_np_dtypes, - timestamp_type, - schema_info.children.back(), - stream), - 0, - rmm::device_buffer{0, stream}, - stream); - } - case orc::MAP: { - schema_info.children.emplace_back("offsets"); - schema_info.children.emplace_back("struct"); - auto const child_column_ids = metadata.get_col_type(orc_col_id).subtypes; - auto& children_schema = schema_info.children.back().children; - std::vector> child_columns; - for (std::size_t idx = 0; idx < metadata.get_col_type(orc_col_id).subtypes.size(); idx++) { - children_schema.emplace_back(""); - child_columns.push_back(create_empty_column(child_column_ids[idx], - metadata, - decimal128_columns, - use_np_dtypes, - timestamp_type, - schema_info.children.back().children.back(), - stream)); - children_schema[idx].name = get_map_child_col_name(idx); - } - return make_lists_column( - 0, - make_empty_column(type_id::INT32), - make_structs_column(0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream), - 0, - rmm::device_buffer{0, stream}, - stream); - } - - case orc::STRUCT: { - std::vector> child_columns; - for (auto const col : metadata.get_col_type(orc_col_id).subtypes) { - schema_info.children.emplace_back(""); - child_columns.push_back(create_empty_column(col, - metadata, - decimal128_columns, - use_np_dtypes, - timestamp_type, - schema_info.children.back(), - stream)); - } - return make_structs_column( - 0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); - } - - case orc::DECIMAL: { - int32_t scale = 0; - if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { - scale = -static_cast(metadata.get_types()[orc_col_id].scale.value_or(0)); - } - return make_empty_column(data_type(type, scale)); - } - - default: return make_empty_column(type); - } -} - -/** - * @brief Assemble the buffer with child columns. 
- */ -column_buffer assemble_buffer(size_type orc_col_id, - std::size_t level, - reader_column_meta const& col_meta, - aggregate_orc_metadata const& metadata, - column_hierarchy const& selected_columns, - std::vector>& col_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const col_id = col_meta.orc_col_map[level][orc_col_id]; - auto& col_buffer = col_buffers[level][col_id]; - - col_buffer.name = metadata.column_name(0, orc_col_id); - auto kind = metadata.get_col_type(orc_col_id).kind; - switch (kind) { - case orc::LIST: - case orc::STRUCT: { - auto const& children_indices = selected_columns.children.at(orc_col_id); - for (auto const child_id : children_indices) { - col_buffer.children.emplace_back(assemble_buffer( - child_id, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); - } - } break; - - case orc::MAP: { - std::vector child_col_buffers; - // Get child buffers - auto const& children_indices = selected_columns.children.at(orc_col_id); - for (std::size_t idx = 0; idx < children_indices.size(); idx++) { - auto const col = children_indices[idx]; - child_col_buffers.emplace_back(assemble_buffer( - col, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); - child_col_buffers.back().name = get_map_child_col_name(idx); - } - // Create a struct buffer - auto num_rows = child_col_buffers[0].size; - auto struct_buffer = - column_buffer(cudf::data_type(type_id::STRUCT), num_rows, false, stream, mr); - struct_buffer.children = std::move(child_col_buffers); - struct_buffer.name = "struct"; - - col_buffer.children.emplace_back(std::move(struct_buffer)); - } break; - - default: break; - } - - return std::move(col_buffer); -} - -} // namespace reader::impl::impl(std::vector>&& sources, orc_reader_options const& options, @@ -945,14 +26,14 @@ reader::impl::impl(std::vector>&& sources, rmm::mr::device_memory_resource* mr) : _stream(stream), _mr(mr), - _sources(std::move(sources)), - _metadata{_sources, stream}, - _selected_columns{_metadata.select_columns(options.get_columns())}, _timestamp_type{options.get_timestamp_type()}, _use_index{options.is_enabled_use_index()}, _use_np_dtypes{options.is_enabled_use_np_dtypes()}, _decimal128_columns{options.get_decimal128_columns()}, - _col_meta{std::make_unique()} + _col_meta{std::make_unique()}, + _sources(std::move(sources)), + _metadata{_sources, stream}, + _selected_columns{_metadata.select_columns(options.get_columns())} { } @@ -960,23 +41,21 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) { - // Selected columns at different levels of nesting are stored in different elements - // of `selected_columns`; thus, size == 1 means no nested columns - CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, - "skip_rows is not supported by nested columns"); - - // There are no columns in the table - if (_selected_columns.num_levels() == 0) { return {std::make_unique(), table_metadata{}}; } + prepare_data(skip_rows, num_rows_opt, stripes); + return read_chunk_internal(); +} - std::vector> out_buffers(_selected_columns.num_levels()); - std::vector> out_columns; - table_metadata out_metadata; +table_metadata reader::impl::make_output_metadata() +{ + if (_output_metadata) { return table_metadata{*_output_metadata}; } // Copy user data to the output metadata. 
+ table_metadata out_metadata; + out_metadata.per_file_user_data.reserve(_metadata.per_file_metadata.size()); std::transform(_metadata.per_file_metadata.cbegin(), _metadata.per_file_metadata.cend(), std::back_inserter(out_metadata.per_file_user_data), - [](auto& meta) { + [](auto const& meta) { std::unordered_map kv_map; std::transform(meta.ff.metadata.cbegin(), meta.ff.metadata.cend(), @@ -989,12 +68,22 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), out_metadata.per_file_user_data[0].end()}; - // Select only stripes required (aka row groups) - auto const [rows_to_skip, rows_to_read, selected_stripes] = - _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + // Save the output table metadata into `_output_metadata` for reuse next time. + _output_metadata = std::make_unique(out_metadata); + + return out_metadata; +} + +table_with_metadata reader::impl::read_chunk_internal() +{ + // There is no columns in the table. + if (_selected_columns.num_levels() == 0) { return {std::make_unique
(), table_metadata{}}; } + + std::vector> out_columns; + auto out_metadata = make_output_metadata(); // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { + if (_file_itm_data->rows_to_read == 0 || _file_itm_data->selected_stripes.empty()) { std::transform(_selected_columns.levels[0].begin(), _selected_columns.levels[0].end(), std::back_inserter(out_columns), @@ -1011,324 +100,6 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } - // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &selected_stripes = selected_stripes] { - auto const has_timestamp_column = std::any_of( - _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { - return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { - return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; - }); - }); - - return has_timestamp_column - ? cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) - : std::make_unique(); - }(); - - std::vector> lvl_stripe_data(_selected_columns.num_levels()); - std::vector>> null_count_prefix_sums; - - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. - auto& col_meta = *_col_meta; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; - // Association between each ORC column and its cudf::column - col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); - std::vector nested_col; - - // Get a list of column data types - std::vector column_types; - for (auto& col : columns_level) { - auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, - _use_np_dtypes, - _timestamp_type.id(), - to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); - CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or - col_type == type_id::DECIMAL128) { - // sign of the scale is changed since cuDF follows c++ libraries like CNL - // which uses negative scaling, but liborc and other libraries - // follow positive scaling. - auto const scale = - -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); - } else { - column_types.emplace_back(col_type); - } - - // Map each ORC column to its column - col_meta.orc_col_map[level][col.id] = column_types.size() - 1; - if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_col.emplace_back(col); - } - } - - // Get the total number of stripes across all input files. 
- std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); - auto const num_columns = columns_level.size(); - cudf::detail::hostdevice_2dvector chunks( - total_num_stripes, num_columns, _stream); - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - - const bool use_index = - _use_index && - // Do stripes have row group index - _metadata.is_row_grp_idx_present() && - // Only use if we don't have much work with complete columns & stripes - // TODO: Consider nrows, gpu, and tune the threshold - (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && - // Only use if first row is aligned to a stripe boundary - // TODO: Fix logic to handle unaligned rows - (rows_to_skip == 0); - - // Logically view streams as columns - std::vector stream_info; - - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; - - std::size_t stripe_start_row = 0; - std::size_t num_dict_entries = 0; - std::size_t num_rowgroups = 0; - int stripe_idx = 0; - - std::vector, std::size_t>> read_tasks; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; - - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - stream_info, - chunks); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. 
- stripe_data.emplace_back( - cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); - auto dst_base = static_cast(stripe_data.back().data()); - - // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; - stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } - } - - auto const num_rows_per_stripe = stripe_info->numberOfRows; - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); - } - // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; - // start row, number of rows in a each stripe and total number of rows - // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = - (level == 0) ? stripe_info->numberOfRows - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; - chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info = - (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; - chunk.parent_null_count_prefix_sums = - (level == 0) - ? nullptr - : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; - chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .kind; - // num_child_rows for a struct column will be same, for other nested types it will be - // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - chunk.dtype_id = column_types[col_idx].id(); - chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .scale.value_or(0); - - chunk.rowgroup_id = rowgroup_id; - chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) - ? sizeof(string_index_pair) - : ((column_types[col_idx].id() == type_id::LIST) or - (column_types[col_idx].id() == type_id::STRUCT)) - ? 
sizeof(size_type) - : cudf::size_of(column_types[col_idx]); - chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } - } - } - stripe_start_row += num_rows_per_stripe; - num_rowgroups += stripe_num_rowgroups; - - stripe_idx++; - } - } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); - } - - if (stripe_data.empty()) { continue; } - - // Process dataset chunk pages into output columns - auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); - if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_columns); - auto& rw_grp_meta = col_meta.rwgrp_meta; - - // Update start row and num rows per row group - std::transform(rw_grp_meta.begin(), - rw_grp_meta.end(), - row_groups_span.begin(), - rw_grp_meta.begin(), - [&](auto meta, auto& row_grp) { - row_grp.num_rows = meta.num_rows; - row_grp.start_row = meta.start_row; - return meta; - }); - } - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - total_num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); - } else { - if (row_groups.size().first) { - chunks.host_to_device_async(_stream); - row_groups.host_to_device_async(_stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - nullptr, - chunks.base_device_ptr(), - num_columns, - total_num_stripes, - num_rowgroups, - _metadata.get_row_index_stride(), - level == 0, - _stream); - } - } - - for (std::size_t i = 0; i < column_types.size(); ++i) { - bool is_nullable = false; - for (std::size_t j = 0; j < total_num_stripes; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { - is_nullable = true; - break; - } - } - auto is_list_type = (column_types[i].id() == type_id::LIST); - auto n_rows = (level == 0) ? 
rows_to_read : col_meta.num_child_rows[i]; - // For list column, offset column will be always size + 1 - if (is_list_type) n_rows++; - out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); - } - - decode_stream_data(num_dict_entries, - rows_to_skip, - _metadata.get_row_index_stride(), - level, - tz_table->view(), - chunks, - row_groups, - out_buffers[level], - _stream, - _mr); - - if (nested_col.size()) { - // Extract information to process nested child columns - scan_null_counts(chunks, null_count_prefix_sums[level], _stream); - - row_groups.device_to_host_sync(_stream); - aggregate_child_meta( - level, _selected_columns, chunks, row_groups, nested_col, out_buffers[level], col_meta); - - // ORC stores number of elements at each row, so we need to generate offsets from that - std::vector buff_data; - std::for_each( - out_buffers[level].begin(), out_buffers[level].end(), [&buff_data](auto& out_buffer) { - if (out_buffer.type.id() == type_id::LIST) { - auto data = static_cast(out_buffer.data()); - buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); - } - }); - - if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } - } - } - // Create columns from buffer with respective schema information. std::transform( _selected_columns.levels[0].begin(), @@ -1337,7 +108,7 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, [&](auto const& orc_col_meta) { out_metadata.schema_info.emplace_back(""); auto col_buffer = assemble_buffer( - orc_col_meta.id, 0, col_meta, _metadata, _selected_columns, out_buffers, _stream, _mr); + orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); return make_column(col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4a7771687f6..6561c08f2d9 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -17,11 +17,8 @@ #pragma once #include "aggregate_orc_metadata.hpp" -#include "orc.hpp" -#include "orc_gpu.hpp" #include -#include #include #include @@ -30,15 +27,13 @@ #include #include -#include -#include +#include #include namespace cudf::io::orc::detail { -namespace { struct reader_column_meta; -} +struct file_intermediate_data; /** * @brief Implementation for ORC reader. @@ -62,7 +57,7 @@ class reader::impl { * @brief Read an entire set or a subset of data and returns a set of columns * * @param skip_rows Number of rows to skip from the start - * @param num_rows_opt Optional number of rows to read + * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows * @param stripes Indices of individual stripes to load if non-empty * @return The set of columns along with metadata */ @@ -71,18 +66,50 @@ class reader::impl { std::vector> const& stripes); private: + /** + * @brief Perform all the necessary data preprocessing before creating an output table. + * + * @param skip_rows Number of rows to skip from the start + * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows + * @param stripes Indices of individual stripes to load if non-empty + */ + void prepare_data(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes); + + /** + * @brief Create the output table metadata from file metadata. 
+ * + * @return Columns' metadata to output with the table read from file + */ + table_metadata make_output_metadata(); + + /** + * @brief Read a chunk of data from the input source and return an output table with metadata. + * + * This function is called internally and expects all preprocessing steps have already been done. + * + * @return The output table along with columns' metadata + */ + table_with_metadata read_chunk_internal(); + rmm::cuda_stream_view const _stream; rmm::mr::device_memory_resource* const _mr; - std::vector> const _sources; // Unused but owns data for `_metadata` - aggregate_orc_metadata _metadata; - column_hierarchy const _selected_columns; // Need to be after _metadata - + // Reader configs data_type const _timestamp_type; // Override output timestamp resolution bool const _use_index; // Enable or disable attempt to use row index for parsing bool const _use_np_dtypes; // Enable or disable the conversion to numpy-compatible dtypes std::vector const _decimal128_columns; // Control decimals conversion std::unique_ptr const _col_meta; // Track of orc mapping and child details + + // Intermediate data for internal processing. + std::vector> const _sources; // Unused but owns data for `_metadata` + aggregate_orc_metadata _metadata; + column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it + std::unique_ptr _file_itm_data; + std::unique_ptr _output_metadata; + std::vector> _out_buffers; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp new file mode 100644 index 00000000000..44ece671155 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "orc_gpu.hpp" + +#include + +#include + +#include +#include + +namespace cudf::io::orc::detail { + +/** + * @brief Struct to store file-level data that remains constant for all chunks being read. + */ +struct file_intermediate_data { + std::vector> lvl_stripe_data; + std::vector>> null_count_prefix_sums; + + int64_t rows_to_skip; + size_type rows_to_read; + std::vector selected_stripes; +}; + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp new file mode 100644 index 00000000000..ea4e5dcfaab --- /dev/null +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl_helpers.hpp" + +namespace cudf::io::orc::detail { + +std::unique_ptr create_empty_column(size_type orc_col_id, + aggregate_orc_metadata const& metadata, + host_span decimal128_columns, + bool use_np_dtypes, + data_type timestamp_type, + column_name_info& schema_info, + rmm::cuda_stream_view stream) +{ + schema_info.name = metadata.column_name(0, orc_col_id); + auto const kind = metadata.get_col_type(orc_col_id).kind; + auto const type = to_cudf_type(kind, + use_np_dtypes, + timestamp_type.id(), + to_cudf_decimal_type(decimal128_columns, metadata, orc_col_id)); + + switch (kind) { + case orc::LIST: { + schema_info.children.emplace_back("offsets"); + schema_info.children.emplace_back(""); + return make_lists_column(0, + make_empty_column(type_id::INT32), + create_empty_column(metadata.get_col_type(orc_col_id).subtypes[0], + metadata, + decimal128_columns, + use_np_dtypes, + timestamp_type, + schema_info.children.back(), + stream), + 0, + rmm::device_buffer{0, stream}, + stream); + } + case orc::MAP: { + schema_info.children.emplace_back("offsets"); + schema_info.children.emplace_back("struct"); + auto const child_column_ids = metadata.get_col_type(orc_col_id).subtypes; + auto& children_schema = schema_info.children.back().children; + std::vector> child_columns; + for (std::size_t idx = 0; idx < metadata.get_col_type(orc_col_id).subtypes.size(); idx++) { + children_schema.emplace_back(""); + child_columns.push_back(create_empty_column(child_column_ids[idx], + metadata, + decimal128_columns, + use_np_dtypes, + timestamp_type, + schema_info.children.back().children.back(), + stream)); + children_schema[idx].name = get_map_child_col_name(idx); + } + return make_lists_column( + 0, + make_empty_column(type_id::INT32), + make_structs_column(0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream), + 0, + rmm::device_buffer{0, stream}, + stream); + } + + case orc::STRUCT: { + std::vector> child_columns; + for (auto const col : metadata.get_col_type(orc_col_id).subtypes) { + schema_info.children.emplace_back(""); + child_columns.push_back(create_empty_column(col, + metadata, + decimal128_columns, + use_np_dtypes, + timestamp_type, + schema_info.children.back(), + stream)); + } + return make_structs_column( + 0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); + } + + case orc::DECIMAL: { + int32_t scale = 0; + if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { + scale = -static_cast(metadata.get_types()[orc_col_id].scale.value_or(0)); + } + return make_empty_column(data_type(type, scale)); + } + + default: return make_empty_column(type); + } +} + +column_buffer assemble_buffer(size_type orc_col_id, + std::size_t level, + reader_column_meta const& col_meta, + aggregate_orc_metadata const& metadata, + column_hierarchy const& selected_columns, + std::vector>& col_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const col_id = col_meta.orc_col_map[level][orc_col_id]; + auto& col_buffer = col_buffers[level][col_id]; + + col_buffer.name = metadata.column_name(0, orc_col_id); + auto kind = metadata.get_col_type(orc_col_id).kind; + switch (kind) { + case orc::LIST: + case orc::STRUCT: { + auto const& children_indices = selected_columns.children.at(orc_col_id); + for (auto const child_id : children_indices) { + 
col_buffer.children.emplace_back(assemble_buffer( + child_id, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); + } + } break; + + case orc::MAP: { + std::vector child_col_buffers; + // Get child buffers + auto const& children_indices = selected_columns.children.at(orc_col_id); + for (std::size_t idx = 0; idx < children_indices.size(); idx++) { + auto const col = children_indices[idx]; + child_col_buffers.emplace_back(assemble_buffer( + col, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); + child_col_buffers.back().name = get_map_child_col_name(idx); + } + // Create a struct buffer + auto num_rows = child_col_buffers[0].size; + auto struct_buffer = + column_buffer(cudf::data_type(type_id::STRUCT), num_rows, false, stream, mr); + struct_buffer.children = std::move(child_col_buffers); + struct_buffer.name = "struct"; + + col_buffer.children.emplace_back(std::move(struct_buffer)); + } break; + + default: break; + } + + return std::move(col_buffer); +} + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp new file mode 100644 index 00000000000..f0d91c75fc3 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "aggregate_orc_metadata.hpp" +#include "orc.hpp" + +#include + +#include + +#include + +#include +#include + +namespace cudf::io::orc::detail { +using namespace cudf::io::detail; + +/** + * @brief Keeps track of orc mapping and child column details. + */ +struct reader_column_meta { + // Mapping between column id in orc to processing order. + std::vector> orc_col_map; + + // Number of rows in child columns. + std::vector num_child_rows; + + // Consists of parent column valid_map and null count. + std::vector parent_column_data; + + std::vector parent_column_index; + + // Start row of child columns [stripe][column]. + std::vector child_start_row; + + // Number of rows of child columns [stripe][column]. + std::vector num_child_rows_per_stripe; + + struct row_group_meta { + uint32_t num_rows; // number of rows in a column in a row group + uint32_t start_row; // start row in a column in a row group + }; + + // Row group metadata [rowgroup][column]. 
+ std::vector rwgrp_meta; +}; + +/** + * @brief Function that translates ORC data kind to cuDF type enum + */ +inline constexpr type_id to_cudf_type(orc::TypeKind kind, + bool use_np_dtypes, + type_id timestamp_type_id, + type_id decimal_type_id) +{ + switch (kind) { + case orc::BOOLEAN: return type_id::BOOL8; + case orc::BYTE: return type_id::INT8; + case orc::SHORT: return type_id::INT16; + case orc::INT: return type_id::INT32; + case orc::LONG: return type_id::INT64; + case orc::FLOAT: return type_id::FLOAT32; + case orc::DOUBLE: return type_id::FLOAT64; + case orc::STRING: + case orc::BINARY: + case orc::VARCHAR: + case orc::CHAR: + // Variable-length types can all be mapped to STRING + return type_id::STRING; + case orc::TIMESTAMP: + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + case orc::DATE: + // There isn't a (DAYS -> np.dtype) mapping + return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; + case orc::DECIMAL: return decimal_type_id; + // Need to update once cuDF plans to support map type + case orc::MAP: + case orc::LIST: return type_id::LIST; + case orc::STRUCT: return type_id::STRUCT; + default: break; + } + + return type_id::EMPTY; +} + +/** + * @brief Determines cuDF type of an ORC Decimal column. + */ +inline type_id to_cudf_decimal_type(host_span decimal128_columns, + aggregate_orc_metadata const& metadata, + int column_index) +{ + if (metadata.get_col_type(column_index).kind != DECIMAL) { return type_id::EMPTY; } + + if (std::find(decimal128_columns.begin(), + decimal128_columns.end(), + metadata.column_path(0, column_index)) != decimal128_columns.end()) { + return type_id::DECIMAL128; + } + + auto const precision = metadata.get_col_type(column_index) + .precision.value_or(cuda::std::numeric_limits::digits10); + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL32; } + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL64; } + return type_id::DECIMAL128; +} + +inline std::string get_map_child_col_name(std::size_t const idx) +{ + return (idx == 0) ? "key" : "value"; +} + +/** + * @brief Create empty columns and respective schema information from the buffer. + */ +std::unique_ptr create_empty_column(size_type orc_col_id, + aggregate_orc_metadata const& metadata, + host_span decimal128_columns, + bool use_np_dtypes, + data_type timestamp_type, + column_name_info& schema_info, + rmm::cuda_stream_view stream); + +/** + * @brief Assemble the buffer with child columns. + */ +column_buffer assemble_buffer(size_type orc_col_id, + std::size_t level, + reader_column_meta const& col_meta, + aggregate_orc_metadata const& metadata, + column_hierarchy const& selected_columns, + std::vector>& col_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu new file mode 100644 index 00000000000..179afa12bd5 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -0,0 +1,1048 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" +#include "reader_impl_helpers.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::orc::detail { + +namespace { + +/** + * @brief Struct that maps ORC streams to columns + */ +struct orc_stream_info { + explicit orc_stream_info(uint64_t offset_, + std::size_t dst_pos_, + uint32_t length_, + uint32_t stripe_idx_) + : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_) + { + } + uint64_t offset; // offset in file + std::size_t dst_pos; // offset in memory relative to start of compressed stripe data + std::size_t length; // length in file + uint32_t stripe_idx; // stripe index +}; + +/** + * @brief Function that populates column descriptors stream/chunk + */ +std::size_t gather_stream_info(std::size_t stripe_index, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + std::size_t* num_dictionary_entries, + std::vector& stream_info, + cudf::detail::hostdevice_2dvector& chunks) +{ + uint64_t src_offset = 0; + uint64_t dst_offset = 0; + + auto const get_stream_index_type = [](orc::StreamKind kind) { + switch (kind) { + case orc::DATA: return gpu::CI_DATA; + case orc::LENGTH: + case orc::SECONDARY: return gpu::CI_DATA2; + case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; + case orc::PRESENT: return gpu::CI_PRESENT; + case orc::ROW_INDEX: return gpu::CI_INDEX; + default: + // Skip this stream as it's not strictly required + return gpu::CI_NUM_STREAMS; + } + }; + + for (auto const& stream : stripefooter->streams) { + if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { + dst_offset += stream.length; + continue; + } + + auto const column_id = *stream.column_id; + auto col = orc2gdf[column_id]; + + if (col == -1 and apply_struct_map) { + // A struct-type column has no data itself, but rather child columns + // for each of its fields. There is only a PRESENT stream, which + // needs to be included for the reader. + auto const schema_type = types[column_id]; + if (not schema_type.subtypes.empty()) { + if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { + for (auto const& idx : schema_type.subtypes) { + auto child_idx = (idx < orc2gdf.size()) ? 
orc2gdf[idx] : -1; + if (child_idx >= 0) { + col = child_idx; + auto& chunk = chunks[stripe_index][col]; + chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); + chunk.strm_len[gpu::CI_PRESENT] = stream.length; + } + } + } + } + } + if (col != -1) { + if (src_offset >= stripeinfo->indexLength || use_index) { + auto& chunk = chunks[stripe_index][col]; + auto const index_type = get_stream_index_type(stream.kind); + if (index_type < gpu::CI_NUM_STREAMS) { + chunk.strm_id[index_type] = stream_info.size(); + chunk.strm_len[index_type] = stream.length; + // NOTE: skip_count field is temporarily used to track the presence of index streams + chunk.skip_count |= 1 << index_type; + + if (index_type == gpu::CI_DICTIONARY) { + chunk.dictionary_start = *num_dictionary_entries; + chunk.dict_len = stripefooter->columns[column_id].dictionarySize; + *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; + } + } + } + stream_info.emplace_back( + stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); + dst_offset += stream.length; + } + src_offset += stream.length; + } + + return dst_offset; +} + +/** + * @brief Decompresses the stripe data, at stream granularity. + * + * @param decompressor Block decompressor + * @param stripe_data List of source stripe column data + * @param stream_info List of stream to column mappings + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param num_stripes Number of stripes making up column chunks + * @param row_index_stride Distance between each row index + * @param use_base_stride Whether to use base stride obtained from meta or use the computed value + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Device buffer to decompressed page data + */ +rmm::device_buffer decompress_stripe_data( + OrcDecompressor const& decompressor, + host_span stripe_data, + host_span stream_info, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::hostdevice_2dvector& row_groups, + std::size_t num_stripes, + std::size_t row_index_stride, + bool use_base_stride, + rmm::cuda_stream_view stream) +{ + // Parse the columns' compressed info + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), stream); + for (auto const& info : stream_info) { + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + } + compinfo.host_to_device_async(stream); + + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); + compinfo.device_to_host_sync(stream); + + // Count the exact number of compressed blocks + std::size_t num_compressed_blocks = 0; + std::size_t num_uncompressed_blocks = 0; + std::size_t total_decomp_size = 0; + for (std::size_t i = 0; i < compinfo.size(); ++i) { + num_compressed_blocks += compinfo[i].num_compressed_blocks; + num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; + total_decomp_size += compinfo[i].max_uncompressed_size; + } + CUDF_EXPECTS( + not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), + "Inconsistent info on compression blocks"); + + // Buffer needs to be padded. + // Required by `gpuDecodeOrcColumnData`. 
+ rmm::device_buffer decomp_data(
+ cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
+ if (decomp_data.is_empty()) { return decomp_data; }
+
+ rmm::device_uvector<device_span<uint8_t const>> inflate_in(
+ num_compressed_blocks + num_uncompressed_blocks, stream);
+ rmm::device_uvector<device_span<uint8_t>> inflate_out(
+ num_compressed_blocks + num_uncompressed_blocks, stream);
+ rmm::device_uvector<compression_result> inflate_res(num_compressed_blocks, stream);
+ thrust::fill(rmm::exec_policy(stream),
+ inflate_res.begin(),
+ inflate_res.end(),
+ compression_result{0, compression_status::FAILURE});
+
+ // Parse again to populate the decompression input/output buffers
+ std::size_t decomp_offset = 0;
+ uint32_t max_uncomp_block_size = 0;
+ uint32_t start_pos = 0;
+ auto start_pos_uncomp = (uint32_t)num_compressed_blocks;
+ for (std::size_t i = 0; i < compinfo.size(); ++i) {
+ auto dst_base = static_cast<uint8_t*>(decomp_data.data());
+ compinfo[i].uncompressed_data = dst_base + decomp_offset;
+ compinfo[i].dec_in_ctl = inflate_in.data() + start_pos;
+ compinfo[i].dec_out_ctl = inflate_out.data() + start_pos;
+ compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks};
+ compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp;
+ compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp;
+
+ stream_info[i].dst_pos = decomp_offset;
+ decomp_offset += compinfo[i].max_uncompressed_size;
+ start_pos += compinfo[i].num_compressed_blocks;
+ start_pos_uncomp += compinfo[i].num_uncompressed_blocks;
+ max_uncomp_block_size =
+ std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size);
+ }
+ compinfo.host_to_device_async(stream);
+ gpu::ParseCompressedStripeData(compinfo.device_ptr(),
+ compinfo.size(),
+ decompressor.GetBlockSize(),
+ decompressor.GetLog2MaxCompressionRatio(),
+ stream);
+
+ // Value for checking whether we decompress successfully.
+ // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. 
+ cudf::detail::hostdevice_vector any_block_failure(1, stream); + any_block_failure[0] = false; + any_block_failure.host_to_device_async(stream); + + // Dispatch batches of blocks to decompress + if (num_compressed_blocks > 0) { + device_span> inflate_in_view{inflate_in.data(), + num_compressed_blocks}; + device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; + switch (decompressor.compression()) { + case compression_type::ZLIB: + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + gpuinflate( + inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); + } else { + nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, + inflate_in_view, + inflate_out_view, + inflate_res, + max_uncomp_block_size, + total_decomp_size, + stream); + } + break; + case compression_type::SNAPPY: + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); + } else { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + inflate_in_view, + inflate_out_view, + inflate_res, + max_uncomp_block_size, + total_decomp_size, + stream); + } + break; + case compression_type::ZSTD: + if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Decompression error: " + reason.value()); + } + nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, + inflate_in_view, + inflate_out_view, + inflate_res, + max_uncomp_block_size, + total_decomp_size, + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; + } + + // Check if any block has been failed to decompress. + // Not using `thrust::any` or `thrust::count_if` to defer stream sync. + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(std::size_t{0}), + thrust::make_counting_iterator(inflate_res.size()), + [results = inflate_res.begin(), + any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) { + if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; } + }); + } + + if (num_uncompressed_blocks > 0) { + device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, + num_uncompressed_blocks}; + device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, + num_uncompressed_blocks}; + gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); + } + + // Copy without stream sync, thus need to wait for stream sync below to access. + any_block_failure.device_to_host_async(stream); + + gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); + compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. + + // We can check on host after stream synchronize + CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); + + auto const num_columns = chunks.size().second; + + // Update the stream information with the updated uncompressed info + // TBD: We could update the value from the information we already + // have in stream_info[], but using the gpu results also updates + // max_uncompressed_size to the actual uncompressed size, or zero if + // decompression failed. 
+ for (std::size_t i = 0; i < num_stripes; ++i) { + for (std::size_t j = 0; j < num_columns; ++j) { + auto& chunk = chunks[i][j]; + for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { + if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { + chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; + chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; + } + } + } + } + + if (row_groups.size().first) { + chunks.host_to_device_async(stream); + row_groups.host_to_device_async(stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + compinfo.device_ptr(), + chunks.base_device_ptr(), + num_columns, + num_stripes, + row_groups.size().first, + row_index_stride, + use_base_stride, + stream); + } + + return decomp_data; +} + +/** + * @brief Updates null mask of columns whose parent is a struct column. + * + * If struct column has null element, that row would be skipped while writing child column in ORC, + * so we need to insert the missing null elements in child column. There is another behavior from + * pyspark, where if the child column doesn't have any null elements, it will not have present + * stream, so in that case parent null mask need to be copied to child column. + * + * @param chunks Vector of list of column chunk descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, + host_span out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + bool is_mask_updated = false; + + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { + if (not is_mask_updated) { + chunks.device_to_host_sync(stream); + is_mask_updated = true; + } + + auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; + auto child_valid_map_base = out_buffers[col_idx].null_mask(); + auto child_mask_len = + chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; + auto parent_mask_len = chunks[0][col_idx].column_num_rows; + + if (child_valid_map_base != nullptr) { + rmm::device_uvector dst_idx(child_mask_len, stream); + // Copy indexes at which the parent has valid value. 
+ thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + parent_mask_len, + dst_idx.begin(), + [parent_valid_map_base] __device__(auto idx) { + return bit_is_set(parent_valid_map_base, idx); + }); + + auto merged_null_mask = cudf::detail::create_null_mask( + parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + auto merged_mask = static_cast(merged_null_mask.data()); + uint32_t* dst_idx_ptr = dst_idx.data(); + // Copy child valid bits from child column to valid indexes, this will merge both child + // and parent null masks + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + dst_idx.size(), + [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { + if (bit_is_set(child_valid_map_base, idx)) { + cudf::set_bit(merged_mask, dst_idx_ptr[idx]); + }; + }); + + out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); + + } else { + // Since child column doesn't have a mask, copy parent null mask + auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); + out_buffers[col_idx].set_null_mask( + rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); + } + } + } + + if (is_mask_updated) { + // Update chunks with pointers to column data which might have been changed. + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + } + } + chunks.host_to_device_sync(stream); + } +} + +/** + * @brief Converts the stripe column data and outputs to columns. + * + * @param num_dicts Number of dictionary entries required + * @param skip_rows Number of rows to offset from start + * @param row_index_stride Distance between each row index + * @param level Current nesting level being processed + * @param tz_table Local time to UTC conversion table + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +void decode_stream_data(std::size_t num_dicts, + std::size_t skip_rows, + std::size_t row_index_stride, + std::size_t level, + table_view const& tz_table, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::device_2dspan row_groups, + std::vector& out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); + + // Update chunks with pointers to column data + std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { + std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.column_data_base = out_buffers[col_idx].data(); + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + }); + }); + + // Allocate global dictionary for deserializing + rmm::device_uvector global_dict(num_dicts, stream); + + chunks.host_to_device_sync(stream); + gpu::DecodeNullsAndStringDictionaries( + chunks.base_device_ptr(), global_dict.data(), num_columns, 
num_stripes, skip_rows, stream); + + if (level > 0) { + // Update nullmasks for children if parent was a struct and had null mask + update_null_mask(chunks, out_buffers, stream, mr); + } + + auto const tz_table_dptr = table_device_view::create(tz_table, stream); + rmm::device_scalar error_count(0, stream); + // Update the null map for child columns + gpu::DecodeOrcColumnData(chunks.base_device_ptr(), + global_dict.data(), + row_groups, + num_columns, + num_stripes, + skip_rows, + *tz_table_dptr, + row_groups.size().first, + row_index_stride, + level, + error_count.data(), + stream); + chunks.device_to_host_async(stream); + // `value` synchronizes + auto const num_errors = error_count.value(stream); + CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); + + std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { + out_buffers[col_idx].null_count() = + std::accumulate(stripe_idx_it + 0, + stripe_idx_it + num_stripes, + 0, + [&](auto null_count, auto const stripe_idx) { + return null_count + chunks[stripe_idx][col_idx].null_count; + }); + }); +} + +/** + * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current + * layer. + */ +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, + cudf::host_span> prefix_sums, + rmm::cuda_stream_view stream) +{ + auto const num_stripes = chunks.size().first; + if (num_stripes == 0) return; + + auto const num_columns = chunks.size().second; + std::vector>> prefix_sums_to_update; + for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { + // Null counts sums are only needed for children of struct columns + if (chunks[0][col_idx].type_kind == STRUCT) { + prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); + } + } + auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( + prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); + + thrust::for_each(rmm::exec_policy(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [chunks = cudf::detail::device_2dspan{chunks}] __device__( + auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + psums.size(), + psums.begin(), + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + + thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); + }); + // `prefix_sums_to_update` goes out of scope, copy has to be done before we return + stream.synchronize(); +} + +/** + * @brief Aggregate child metadata from parent column chunks. + */ +void aggregate_child_meta(std::size_t level, + cudf::io::orc::detail::column_hierarchy const& selected_columns, + cudf::detail::host_2dspan chunks, + cudf::detail::host_2dspan row_groups, + host_span nested_cols, + host_span out_buffers, + reader_column_meta& col_meta) +{ + auto const num_of_stripes = chunks.size().first; + auto const num_of_rowgroups = row_groups.size().first; + auto const num_child_cols = selected_columns.levels[level + 1].size(); + auto const number_of_child_chunks = num_child_cols * num_of_stripes; + auto& num_child_rows = col_meta.num_child_rows; + auto& parent_column_data = col_meta.parent_column_data; + + // Reset the meta to store child column details. 
+ num_child_rows.resize(selected_columns.levels[level + 1].size()); + std::fill(num_child_rows.begin(), num_child_rows.end(), 0); + parent_column_data.resize(number_of_child_chunks); + col_meta.parent_column_index.resize(number_of_child_chunks); + col_meta.child_start_row.resize(number_of_child_chunks); + col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); + col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); + + auto child_start_row = cudf::detail::host_2dspan( + col_meta.child_start_row.data(), num_of_stripes, num_child_cols); + auto num_child_rows_per_stripe = cudf::detail::host_2dspan( + col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); + auto rwgrp_meta = cudf::detail::host_2dspan( + col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + + int index = 0; // number of child column processed + + // For each parent column, update its child column meta for each stripe. + std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { + auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; + auto start_row = 0; + auto processed_row_groups = 0; + + for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { + // Aggregate num_rows and start_row from processed parent columns per row groups + if (num_of_rowgroups) { + auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; + auto processed_child_rows = 0; + + for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; + rowgroup_id++, processed_row_groups++) { + auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; + rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; + } + processed_child_rows += child_rows; + } + } + + // Aggregate start row, number of rows per chunk and total number of rows in a column + auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + + num_child_rows[child_col_idx] += child_rows; + num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; + // start row could be different for each column when there is nesting at each stripe level + child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; + } + start_row += child_rows; + } + + // Parent column null mask and null count would be required for child column + // to adjust its nullmask. 
+ auto type = out_buffers[parent_col_idx].type.id(); + auto parent_null_count = static_cast(out_buffers[parent_col_idx].null_count()); + auto parent_valid_map = out_buffers[parent_col_idx].null_mask(); + auto num_rows = out_buffers[parent_col_idx].size; + + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + col_meta.parent_column_index[child_col_idx] = parent_col_idx; + if (type == type_id::STRUCT) { + parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count}; + // Number of rows in child will remain same as parent in case of struct column + num_child_rows[child_col_idx] = num_rows; + } else { + parent_column_data[child_col_idx] = {nullptr, 0}; + } + } + index += p_col.num_children; + }); +} + +/** + * @brief struct to store buffer data and size of list buffer + */ +struct list_buffer_data { + size_type* data; + size_type size; +}; + +// Generates offsets for list buffer from number of elements in a row. +void generate_offsets_for_list(host_span buff_data, rmm::cuda_stream_view stream) +{ + for (auto& list_data : buff_data) { + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + list_data.data, + list_data.data + list_data.size, + list_data.data); + } +} + +} // namespace + +void reader::impl::prepare_data(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes) +{ + // Selected columns at different levels of nesting are stored in different elements + // of `selected_columns`; thus, size == 1 means no nested columns + CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, + "skip_rows is not supported by nested columns"); + + // There are no columns in the table + if (_selected_columns.num_levels() == 0) { return; } + + _file_itm_data = std::make_unique(); + + // Select only stripes required (aka row groups) + std::tie( + _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = + _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + auto const rows_to_skip = _file_itm_data->rows_to_skip; + auto const rows_to_read = _file_itm_data->rows_to_read; + auto const& selected_stripes = _file_itm_data->selected_stripes; + + // If no rows or stripes to read, return empty columns + if (rows_to_read == 0 || selected_stripes.empty()) { return; } + + // Set up table for converting timestamp columns from local to UTC time + auto const tz_table = [&, &selected_stripes = selected_stripes] { + auto const has_timestamp_column = std::any_of( + _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { + return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { + return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; + }); + }); + + return has_timestamp_column + ? cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) + : std::make_unique(); + }(); + + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; + lvl_stripe_data.resize(_selected_columns.num_levels()); + + _out_buffers.resize(_selected_columns.num_levels()); + + // Iterates through levels of nested columns, child column will be one level down + // compared to parent column. 
+ auto& col_meta = *_col_meta; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& columns_level = _selected_columns.levels[level]; + // Association between each ORC column and its cudf::column + col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); + std::vector nested_cols; + + // Get a list of column data types + std::vector column_types; + for (auto& col : columns_level) { + auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, + _use_np_dtypes, + _timestamp_type.id(), + to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); + CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { + // sign of the scale is changed since cuDF follows c++ libraries like CNL + // which uses negative scaling, but liborc and other libraries + // follow positive scaling. + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); + column_types.emplace_back(col_type, scale); + } else { + column_types.emplace_back(col_type); + } + + // Map each ORC column to its column + col_meta.orc_col_map[level][col.id] = column_types.size() - 1; + if (col_type == type_id::LIST or col_type == type_id::STRUCT) { + nested_cols.emplace_back(col); + } + } + + // Get the total number of stripes across all input files. + std::size_t total_num_stripes = + std::accumulate(selected_stripes.begin(), + selected_stripes.end(), + 0, + [](std::size_t sum, auto& stripe_source_mapping) { + return sum + stripe_source_mapping.stripe_info.size(); + }); + auto const num_columns = columns_level.size(); + cudf::detail::hostdevice_2dvector chunks( + total_num_stripes, num_columns, _stream); + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + + const bool use_index = + _use_index && + // Do stripes have row group index + _metadata.is_row_grp_idx_present() && + // Only use if we don't have much work with complete columns & stripes + // TODO: Consider nrows, gpu, and tune the threshold + (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && + _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && + // Only use if first row is aligned to a stripe boundary + // TODO: Fix logic to handle unaligned rows + (rows_to_skip == 0); + + // Logically view streams as columns + std::vector stream_info; + + null_count_prefix_sums.emplace_back(); + null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + _selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + total_num_stripes, _stream, rmm::mr::get_current_device_resource()); + }); + + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; + + std::size_t stripe_start_row = 0; + std::size_t num_dict_entries = 0; + std::size_t num_rowgroups = 0; + int stripe_idx = 0; + + std::vector, std::size_t>> read_tasks; + for (auto const& stripe_source_mapping : selected_stripes) { + // Iterate through the source files selected stripes + for (auto const& stripe : stripe_source_mapping.stripe_info) { + auto const stripe_info = stripe.first; + auto const stripe_footer = stripe.second; + + auto stream_count = stream_info.size(); + auto const total_data_size = gather_stream_info(stripe_idx, + stripe_info, + stripe_footer, + 
col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + stream_info, + chunks); + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + // Buffer needs to be padded. + // Required by `copy_uncompressed_kernel`. + stripe_data.emplace_back( + cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); + auto dst_base = static_cast(stripe_data.back().data()); + + // Coalesce consecutive streams into one read + while (not is_stripe_data_empty and stream_count < stream_info.size()) { + auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + auto const offset = stream_info[stream_count].offset; + auto len = stream_info[stream_count].length; + stream_count++; + + while (stream_count < stream_info.size() && + stream_info[stream_count].offset == offset + len) { + len += stream_info[stream_count].length; + stream_count++; + } + if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->is_device_read_preferred(len)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->device_read_async(offset, len, d_dst, _stream), + len)); + + } else { + auto const buffer = + _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( + offset, len); + CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); + _stream.synchronize(); + } + } + + auto const num_rows_per_stripe = stripe_info->numberOfRows; + auto const rowgroup_id = num_rowgroups; + auto stripe_num_rowgroups = 0; + if (use_index) { + stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride(); + } + // Update chunks to reference streams pointers + for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { + auto& chunk = chunks[stripe_idx][col_idx]; + // start row, number of rows in a each stripe and total number of rows + // may change in lower levels of nesting + chunk.start_row = (level == 0) + ? stripe_start_row + : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; + chunk.num_rows = + (level == 0) ? stripe_info->numberOfRows + : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? nullptr + : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] + .kind; + // num_child_rows for a struct column will be same, for other nested types it will be + // calculated. + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] + .scale.value_or(0); + + chunk.rowgroup_id = rowgroup_id; + chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) + ? 
sizeof(string_index_pair) + : ((column_types[col_idx].id() == type_id::LIST) or + (column_types[col_idx].id() == type_id::STRUCT)) + ? sizeof(size_type) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } + if (not is_stripe_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + } + } + } + stripe_start_row += num_rows_per_stripe; + num_rowgroups += stripe_num_rowgroups; + + stripe_idx++; + } + } + for (auto& task : read_tasks) { + CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + } + + if (stripe_data.empty()) { continue; } + + // Process dataset chunk pages into output columns + auto row_groups = + cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); + if (level > 0 and row_groups.size().first) { + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_columns); + auto& rw_grp_meta = col_meta.rwgrp_meta; + + // Update start row and num rows per row group + std::transform(rw_grp_meta.begin(), + rw_grp_meta.end(), + row_groups_span.begin(), + rw_grp_meta.begin(), + [&](auto meta, auto& row_grp) { + row_grp.num_rows = meta.num_rows; + row_grp.start_row = meta.start_row; + return meta; + }); + } + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, + stripe_data, + stream_info, + chunks, + row_groups, + total_num_stripes, + _metadata.get_row_index_stride(), + level == 0, + _stream); + stripe_data.clear(); + stripe_data.push_back(std::move(decomp_data)); + } else { + if (row_groups.size().first) { + chunks.host_to_device_async(_stream); + row_groups.host_to_device_async(_stream); + row_groups.host_to_device_async(_stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + nullptr, + chunks.base_device_ptr(), + num_columns, + total_num_stripes, + num_rowgroups, + _metadata.get_row_index_stride(), + level == 0, + _stream); + } + } + + for (std::size_t i = 0; i < column_types.size(); ++i) { + bool is_nullable = false; + for (std::size_t j = 0; j < total_num_stripes; ++j) { + if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + is_nullable = true; + break; + } + } + auto is_list_type = (column_types[i].id() == type_id::LIST); + auto n_rows = (level == 0) ? 
rows_to_read : col_meta.num_child_rows[i]; + // For list column, offset column will be always size + 1 + if (is_list_type) n_rows++; + _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); + } + + decode_stream_data(num_dict_entries, + rows_to_skip, + _metadata.get_row_index_stride(), + level, + tz_table->view(), + chunks, + row_groups, + _out_buffers[level], + _stream, + _mr); + + if (nested_cols.size()) { + // Extract information to process nested child columns + scan_null_counts(chunks, null_count_prefix_sums[level], _stream); + + row_groups.device_to_host_sync(_stream); + aggregate_child_meta( + level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); + + // ORC stores number of elements at each row, so we need to generate offsets from that + std::vector buff_data; + std::for_each( + _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { + if (out_buffer.type.id() == type_id::LIST) { + auto data = static_cast(out_buffer.data()); + buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); + } + }); + + if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } + } + } // end loop level +} + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index a6a93c41472..3cd70801cdf 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,8 +204,10 @@ class hostdevice_vector { template class hostdevice_2dvector { public: + hostdevice_2dvector() : hostdevice_2dvector(0, 0, cudf::get_default_stream()) {} + hostdevice_2dvector(size_t rows, size_t columns, rmm::cuda_stream_view stream) - : _size{rows, columns}, _data{rows * columns, stream} + : _data{rows * columns, stream}, _size{rows, columns} { } From 49dd5bf8f6daa6ab9c4ca414d51b4402f50a1d1d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 17 Jan 2024 17:24:48 -0500 Subject: [PATCH 03/60] Generate unified Python/C++ docs (#13846) This PR leverages [Breathe](https://breathe.readthedocs.io/en/latest/) to pull the cudf C++ API documentation into the python Sphinx docs build, generating a single unified build of the documentation that supports cross-linking between language libraries and also simplifies cross-linking from other libraries that wish to link here. This PR also revealed numerous other issues with our doxygen docs. I've submitted those as separate patches to control the diff here, but it's worth noting that Sphinx is much louder with warnings than doxygen and will help us avoid many more issues with broken documentation than doxygen alone could. 
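For context, the Breathe integration amounts to pointing Sphinx at Doxygen's XML output. A minimal sketch of the relevant `conf.py` wiring is shown below; the Breathe project name and the XML path are illustrative assumptions for this sketch, not necessarily the exact values used in `docs/cudf/source/conf.py`:

```python
# Minimal Sphinx conf.py sketch for a unified C++/Python docs build via Breathe.
# Assumptions for illustration only: the Breathe project name "libcudf" and the
# relative path to Doxygen's XML output may differ in the real conf.py.

extensions = [
    "sphinx.ext.autodoc",  # existing Python API documentation
    "breathe",             # bridges Doxygen XML into Sphinx
]

# Map a Breathe project name to the directory containing Doxygen's XML output.
breathe_projects = {"libcudf": "../../../cpp/doxygen/xml"}
breathe_default_project = "libcudf"
```

Individual pages (such as the many small `.rst` files added under `libcudf_docs/api_docs/` below) can then render a Doxygen group with a Breathe directive like `.. doxygengroup:: io_readers`, which is presumably how those stub pages are populated.
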
Resolves #11481 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Ashwin Srinath (https://github.com/shwina) - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13846 --- ci/build_docs.sh | 6 +- .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 + cpp/doxygen/developer_guide/TESTING.md | 6 +- .../cudf/strings/strings_column_view.hpp | 2 +- dependencies.yaml | 1 + docs/cudf/source/conf.py | 256 +++++++++++++++++- docs/cudf/source/index.rst | 1 + .../api_docs/aggregation_factories.rst | 5 + .../api_docs/aggregation_groupby.rst | 5 + .../api_docs/aggregation_reduction.rst | 5 + .../api_docs/aggregation_rolling.rst | 5 + .../api_docs/column_aggregation.rst | 14 + .../libcudf_docs/api_docs/column_apis.rst | 23 ++ .../libcudf_docs/api_docs/column_classes.rst | 16 ++ .../libcudf_docs/api_docs/column_copy.rst | 16 ++ .../api_docs/column_factories.rst | 5 + .../libcudf_docs/api_docs/column_hash.rst | 5 + .../libcudf_docs/api_docs/column_interop.rst | 12 + .../libcudf_docs/api_docs/column_join.rst | 5 + .../libcudf_docs/api_docs/column_merge.rst | 5 + .../libcudf_docs/api_docs/column_nullmask.rst | 5 + .../api_docs/column_quantiles.rst | 5 + .../libcudf_docs/api_docs/column_reorder.rst | 12 + .../libcudf_docs/api_docs/column_reshape.rst | 11 + .../libcudf_docs/api_docs/column_search.rst | 5 + .../libcudf_docs/api_docs/column_sort.rst | 5 + .../api_docs/column_transformation.rst | 15 + .../api_docs/copy_concatenate.rst | 5 + .../libcudf_docs/api_docs/copy_gather.rst | 5 + .../libcudf_docs/api_docs/copy_scatter.rst | 5 + .../libcudf_docs/api_docs/copy_shift.rst | 5 + .../libcudf_docs/api_docs/copy_slice.rst | 5 + .../libcudf_docs/api_docs/copy_split.rst | 5 + .../libcudf_docs/api_docs/cudf_classes.rst | 14 + .../libcudf_docs/api_docs/cudf_namespace.rst | 28 ++ .../libcudf_docs/api_docs/datetime_apis.rst | 12 + .../api_docs/datetime_compute.rst | 5 + .../api_docs/datetime_extract.rst | 5 + .../libcudf_docs/api_docs/default_stream.rst | 5 + .../libcudf_docs/api_docs/dictionary_apis.rst | 13 + .../api_docs/dictionary_classes.rst | 5 + .../api_docs/dictionary_encode.rst | 5 + .../api_docs/dictionary_search.rst | 5 + .../api_docs/dictionary_update.rst | 5 + .../libcudf_docs/api_docs/expressions.rst | 5 + .../api_docs/fixed_point_classes.rst | 6 + .../source/libcudf_docs/api_docs/index.rst | 29 ++ .../libcudf_docs/api_docs/interop_arrow.rst | 5 + .../libcudf_docs/api_docs/interop_dlpack.rst | 5 + .../source/libcudf_docs/api_docs/io_apis.rst | 15 + .../libcudf_docs/api_docs/io_datasinks.rst | 5 + .../libcudf_docs/api_docs/io_datasources.rst | 5 + .../libcudf_docs/api_docs/io_readers.rst | 5 + .../source/libcudf_docs/api_docs/io_types.rst | 5 + .../libcudf_docs/api_docs/io_writers.rst | 5 + .../libcudf_docs/api_docs/json_apis.rst | 11 + .../libcudf_docs/api_docs/json_object.rst | 5 + .../libcudf_docs/api_docs/label_bins.rst | 5 + .../libcudf_docs/api_docs/labeling_apis.rst | 11 + .../libcudf_docs/api_docs/lists_apis.rst | 20 ++ .../libcudf_docs/api_docs/lists_classes.rst | 5 + .../libcudf_docs/api_docs/lists_combine.rst | 5 + .../libcudf_docs/api_docs/lists_contains.rst | 5 + .../libcudf_docs/api_docs/lists_elements.rst | 5 + .../libcudf_docs/api_docs/lists_extract.rst | 5 + .../libcudf_docs/api_docs/lists_filling.rst | 5 + .../libcudf_docs/api_docs/lists_filtering.rst | 5 + .../libcudf_docs/api_docs/lists_gather.rst | 5 + 
.../libcudf_docs/api_docs/lists_modify.rst | 5 + .../libcudf_docs/api_docs/lists_sort.rst | 5 + .../libcudf_docs/api_docs/nvtext_apis.rst | 18 ++ .../api_docs/nvtext_edit_distance.rst | 5 + .../libcudf_docs/api_docs/nvtext_jaccard.rst | 5 + .../libcudf_docs/api_docs/nvtext_minhash.rst | 5 + .../libcudf_docs/api_docs/nvtext_ngrams.rst | 5 + .../api_docs/nvtext_normalize.rst | 5 + .../libcudf_docs/api_docs/nvtext_replace.rst | 5 + .../libcudf_docs/api_docs/nvtext_stemmer.rst | 5 + .../libcudf_docs/api_docs/nvtext_tokenize.rst | 5 + .../libcudf_docs/api_docs/reorder_compact.rst | 5 + .../api_docs/reorder_partition.rst | 5 + .../api_docs/reshape_transpose.rst | 5 + .../libcudf_docs/api_docs/scalar_classes.rst | 11 + .../api_docs/scalar_factories.rst | 5 + .../libcudf_docs/api_docs/set_operations.rst | 5 + .../libcudf_docs/api_docs/strings_apis.rst | 23 ++ .../libcudf_docs/api_docs/strings_case.rst | 5 + .../libcudf_docs/api_docs/strings_classes.rst | 5 + .../libcudf_docs/api_docs/strings_combine.rst | 5 + .../api_docs/strings_contains.rst | 5 + .../libcudf_docs/api_docs/strings_convert.rst | 5 + .../libcudf_docs/api_docs/strings_copy.rst | 5 + .../libcudf_docs/api_docs/strings_extract.rst | 5 + .../libcudf_docs/api_docs/strings_find.rst | 5 + .../libcudf_docs/api_docs/strings_modify.rst | 5 + .../libcudf_docs/api_docs/strings_regex.rst | 5 + .../libcudf_docs/api_docs/strings_replace.rst | 5 + .../libcudf_docs/api_docs/strings_slice.rst | 5 + .../libcudf_docs/api_docs/strings_split.rst | 5 + .../libcudf_docs/api_docs/strings_types.rst | 5 + .../libcudf_docs/api_docs/structs_classes.rst | 5 + .../libcudf_docs/api_docs/table_classes.rst | 5 + .../source/libcudf_docs/api_docs/tdigest.rst | 5 + .../api_docs/timestamp_classes.rst | 5 + .../api_docs/transformation_binaryops.rst | 5 + .../api_docs/transformation_fill.rst | 5 + .../api_docs/transformation_replace.rst | 5 + .../api_docs/transformation_transform.rst | 5 + .../api_docs/transformation_unaryops.rst | 5 + .../libcudf_docs/api_docs/utility_apis.rst | 15 + .../libcudf_docs/api_docs/utility_bitmask.rst | 5 + .../api_docs/utility_dispatcher.rst | 5 + .../libcudf_docs/api_docs/utility_error.rst | 5 + .../libcudf_docs/api_docs/utility_span.rst | 5 + .../libcudf_docs/api_docs/utility_types.rst | 5 + docs/cudf/source/libcudf_docs/index.rst | 17 ++ docs/cudf/source/libcudf_docs/md_regex.rst | 4 + .../libcudf_docs/unicode_limitations.rst | 4 + 119 files changed, 1060 insertions(+), 14 deletions(-) create mode 100644 docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/aggregation_rolling.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_aggregation.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_copy.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_factories.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_hash.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_interop.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_join.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_merge.rst create mode 100644 
docs/cudf/source/libcudf_docs/api_docs/column_nullmask.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_quantiles.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_reorder.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_reshape.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_search.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_sort.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/column_transformation.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/copy_concatenate.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/copy_gather.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/copy_scatter.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/copy_shift.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/copy_slice.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/copy_split.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/cudf_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/cudf_namespace.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/datetime_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/datetime_compute.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/datetime_extract.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/default_stream.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/dictionary_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/dictionary_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/dictionary_encode.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/dictionary_search.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/dictionary_update.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/expressions.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/fixed_point_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/index.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/interop_arrow.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/interop_dlpack.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/io_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/io_datasinks.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/io_datasources.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/io_readers.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/io_types.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/io_writers.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/json_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/json_object.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/label_bins.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/labeling_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_combine.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_contains.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_elements.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_extract.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_filling.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_filtering.rst 
create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_gather.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_modify.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/lists_sort.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_edit_distance.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_jaccard.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_minhash.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_ngrams.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_normalize.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_replace.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_stemmer.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/nvtext_tokenize.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/reorder_compact.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/reorder_partition.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/reshape_transpose.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/scalar_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/scalar_factories.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/set_operations.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_case.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_combine.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_contains.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_copy.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_extract.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_find.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_modify.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_regex.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_replace.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_slice.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_split.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/strings_types.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/structs_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/table_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/tdigest.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/timestamp_classes.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/transformation_binaryops.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/transformation_fill.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/transformation_replace.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/transformation_transform.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/transformation_unaryops.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/utility_apis.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/utility_bitmask.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/utility_dispatcher.rst create mode 100644 
docs/cudf/source/libcudf_docs/api_docs/utility_error.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/utility_span.rst create mode 100644 docs/cudf/source/libcudf_docs/api_docs/utility_types.rst create mode 100644 docs/cudf/source/libcudf_docs/index.rst create mode 100644 docs/cudf/source/libcudf_docs/md_regex.rst create mode 100644 docs/cudf/source/libcudf_docs/unicode_limitations.rst diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 67d611340c5..ceab29c2473 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail @@ -40,8 +40,8 @@ popd rapids-logger "Build Python docs" pushd docs/cudf -make dirhtml -make text +make dirhtml O="-j 4" +make text O="-j 4" mkdir -p "${RAPIDS_DOCS_DIR}/cudf/"{html,txt} mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a5e3ea4c531..15bcf207b1b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 +- breathe>=4.35.0 - c-compiler - cachetools - clang-tools=16.0.6 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 579bbb6d52d..ccad6a366fb 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 +- breathe>=4.35.0 - c-compiler - cachetools - clang-tools=16.0.6 diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index c19976a956b..a4ffe0f575b 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -464,9 +464,9 @@ the host (`to_host`). ### Background -libcudf employs a custom-built [preload library -docs](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage (the -code may be found +libcudf employs a custom-built [preload +library](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage +(the code may be found [`here`](https://github.com/rapidsai/cudf/blob/main/cpp/tests/utilities/identify_stream_usage.cpp)). 
This library wraps every asynchronous CUDA runtime API call that accepts a stream with a check to ensure that the passed CUDA stream is a valid one, immediately throwing an exception if an invalid diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 36054f7c229..e6546777f3f 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -106,7 +106,7 @@ class strings_column_view : private column_view { /** * @brief Returns the internal column of chars * - * @throw cudf::logic error if this is an empty column + * @throw cudf::logic_error if this is an empty column * @param stream CUDA stream used for device memory operations and kernel launches * @return The chars column */ diff --git a/dependencies.yaml b/dependencies.yaml index 20998847a75..1c6d5086bf3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -457,6 +457,7 @@ dependencies: common: - output_types: [conda] packages: + - breathe>=4.35.0 - dask-cuda==24.2.* - *doxygen - make diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index d98d1fa5d81..b997c78fba8 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # cudf documentation build configuration file, created by # sphinx-quickstart on Wed May 3 10:59:22 2017. @@ -16,11 +16,33 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import glob import os +import re import sys +import xml.etree.ElementTree as ET from docutils.nodes import Text from sphinx.addnodes import pending_xref +from sphinx.highlighting import lexers +from sphinx.ext import intersphinx +from pygments.lexer import RegexLexer +from pygments.token import Text as PText + + +class PseudoLexer(RegexLexer): + """Trivial lexer for pseudocode.""" + + name = 'pseudocode' + aliases = ['pseudo'] + tokens = { + 'root': [ + (r'.*\n', PText), + ] + } + + +lexers['pseudo'] = PseudoLexer() # -- Custom Extensions ---------------------------------------------------- sys.path.append(os.path.abspath("./_ext")) @@ -35,6 +57,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + "breathe", "sphinx.ext.intersphinx", "sphinx.ext.autodoc", "sphinx.ext.autosummary", @@ -46,6 +69,67 @@ "myst_nb", ] +# Preprocess doxygen xml for compatibility with latest Breathe +def clean_definitions(root): + # Breathe can't handle SFINAE properly: + # https://github.com/breathe-doc/breathe/issues/624 + seen_ids = set() + for sectiondef in root.findall(".//sectiondef"): + for memberdef in sectiondef.findall("./memberdef"): + id_ = memberdef.get("id") + for tparamlist in memberdef.findall("./templateparamlist"): + for param in tparamlist.findall("./param"): + for type_ in param.findall("./type"): + # CUDF_ENABLE_IF or std::enable_if + if "enable_if" in ET.tostring(type_).decode().lower(): + if id_ not in seen_ids: + # If this is the first time we're seeing this function, + # just remove the template parameter. + seen_ids.add(id_) + tparamlist.remove(param) + else: + # Otherwise, remove the overload altogether and just + # rely on documenting one of the SFINAE overloads. + sectiondef.remove(memberdef) + break + + # In addition to enable_if, check for overloads set up by + # ...*=nullptr. 
+ for type_ in param.findall("./defval"): + if "nullptr" in ET.tostring(type_).decode(): + try: + tparamlist.remove(param) + except ValueError: + # May have already been removed in above, + # so skip. + pass + break + + + # All of these in type declarations cause Breathe to choke. + # For friend, see https://github.com/breathe-doc/breathe/issues/916 + strings_to_remove = ("__forceinline__", "CUDF_HOST_DEVICE", "decltype(auto)", "friend") + for field in (".//type", ".//definition"): + for type_ in root.findall(field): + if type_.text is not None: + for string in strings_to_remove: + type_.text = type_.text.replace(string, "") + + +def clean_all_xml_files(path): + for fn in glob.glob(os.path.join(path, "*.xml")): + tree = ET.parse(fn) + clean_definitions(tree.getroot()) + tree.write(fn) + + +# Breathe Configuration +breathe_projects = {"libcudf": "../../../cpp/doxygen/xml"} +for project_path in breathe_projects.values(): + clean_all_xml_files(project_path) +breathe_default_project = "libcudf" + + nb_execution_excludepatterns = ['performance-comparisons.ipynb'] nb_execution_mode = "force" @@ -195,11 +279,13 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - "python": ("https://docs.python.org/3", None), "cupy": ("https://docs.cupy.dev/en/stable/", None), + "dlpack": ("https://dmlc.github.io/dlpack/latest/", None), "numpy": ("https://numpy.org/doc/stable", None), - "pyarrow": ("https://arrow.apache.org/docs/", None), "pandas": ("https://pandas.pydata.org/docs/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "python": ("https://docs.python.org/3", None), + "rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None), "typing_extensions": ("https://typing-extensions.readthedocs.io/en/stable/", None), } @@ -238,14 +324,170 @@ def resolve_aliases(app, doctree): text_node.parent.replace(text_node, Text(text_to_render, "")) -def ignore_internal_references(app, env, node, contnode): - name = node.get("reftarget", None) - if name == "cudf.core.index.GenericIndex": +def _generate_namespaces(namespaces): + all_namespaces = [] + for base_namespace, other_namespaces in namespaces.items(): + all_namespaces.append(base_namespace + "::") + for other_namespace in other_namespaces: + all_namespaces.append(f"{other_namespace}::") + all_namespaces.append(f"{base_namespace}::{other_namespace}::") + return all_namespaces + +_all_namespaces = _generate_namespaces({ + # Note that io::datasource is actually a nested class + "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"}, + "numeric": {}, + "nvtext": {}, +}) + +_names_to_skip = { + # External names + "thrust", + "cuda", + "arrow", + # Unknown types + "int8_t", + "int16_t", + "int32_t", + "int64_t", + "__int128_t", + "size_t", + "uint8_t", + "uint16_t", + "uint32_t", + "uint64_t", + # Internal objects + "id_to_type_impl", + "type_to_scalar_type_impl", + "type_to_scalar_type_impl", + "detail", + # kafka objects + "python_callable_type", + "kafka_oauth_callback_wrapper_type", + # Template types + "Radix", + # Unsupported by Breathe + # https://github.com/breathe-doc/breathe/issues/355 + "deprecated", + # TODO: This type is currently defined in a detail header but it's in + # the public namespace. However, it's used in the detail header, so it + # needs to be put into a public header that can be shared. 
+ "char_utf8", + # TODO: This is currently in a src file but perhaps should be public + "orc::column_statistics", + # Sphinx doesn't know how to distinguish between the ORC and Parquet + # definitions because Breathe doesn't to preserve namespaces for enums. + "TypeKind", +} + +_domain_objects = None +_prefixed_domain_objects = None +_intersphinx_cache = {} + +_intersphinx_extra_prefixes = ("rmm", "rmm::mr", "mr") + + +def _cached_intersphinx_lookup(env, node, contnode): + """Perform an intersphinx lookup and cache the result. + + Have to manually manage the intersphinx cache because lru_cache doesn't + handle the env object properly. + """ + key = (node, contnode) + if key in _intersphinx_cache: + return _intersphinx_cache[key] + if (ref := intersphinx.resolve_reference_detect_inventory(env, node, contnode)) is not None: + _intersphinx_cache[key] = ref + return ref + + +def on_missing_reference(app, env, node, contnode): + # These variables are defined outside the function to speed up the build. + global _all_namespaces, _names_to_skip, _intersphinx_extra_prefixes, \ + _domain_objects, _prefixed_domain_objects, _intersphinx_cache + + # Precompute and cache domains for faster lookups + if _domain_objects is None: + _domain_objects = {} + _prefixed_domain_objects = {} + for (name, _, _, docname, _, _) in env.domains["cpp"].get_objects(): + _domain_objects[name] = docname + for prefix in _all_namespaces: + _prefixed_domain_objects[f"{prefix}{name}"] = name + + reftarget = node.get("reftarget") + if reftarget == "cudf.core.index.GenericIndex": # We don't exposed docs for `cudf.core.index.GenericIndex` # hence we would want the docstring & mypy references to # use `cudf.Index` node["reftarget"] = "cudf.Index" return contnode + if "namespacecudf" in reftarget: + node["reftarget"] = "cudf" + return contnode + if "classcudf_1_1column__device__view_" in reftarget: + node["reftarget"] = "cudf::column_device_view" + return contnode + + if (refid := node.get("refid")) is not None and "hpp" in refid: + # We don't want to link to C++ header files directly from the + # Sphinx docs, those are pages that doxygen automatically + # generates. Adding those would clutter the Sphinx output. + return contnode + + if node["refdomain"] in ("std", "cpp") and reftarget is not None: + if any(toskip in reftarget for toskip in _names_to_skip): + return contnode + + # Strip template parameters and just use the base type. + if match := re.search("(.*)<.*>", reftarget): + reftarget = match.group(1) + + # Try to find the target prefixed with e.g. namespaces in case that's + # all that's missing. + # We need to do this search because the call sites may not have used + # the namespaces and we don't want to force them to, and we have to + # consider both directions because of issues like + # https://github.com/breathe-doc/breathe/issues/860 + # (there may be other related issues, I haven't investigated all + # possible combinations of failures in depth). + if (name := _prefixed_domain_objects.get(reftarget)) is None: + for prefix in _all_namespaces: + if f"{prefix}{reftarget}" in _domain_objects: + name = f"{prefix}{reftarget}" + break + if name is not None: + return env.domains["cpp"].resolve_xref( + env, + _domain_objects[name], + app.builder, + node["reftype"], + name, + node, + contnode, + ) + + # Final possibility is an intersphinx lookup to see if the symbol + # exists in one of the other inventories. First we check the symbol + # itself in case it was originally templated and that caused the lookup + # to fail. 
+ if reftarget != node["reftarget"]: + node["reftarget"] = reftarget + if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None: + return ref + + # If the template wasn't the (only) issue, we check the various + # namespace prefixes that may need to be added or removed. + for prefix in _intersphinx_extra_prefixes: + if prefix not in reftarget: + node["reftarget"] = f"{prefix}::{reftarget}" + if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None: + return ref + else: + node["reftarget"] = reftarget.replace(f"{prefix}::", "") + if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None: + return ref + return None @@ -261,4 +503,4 @@ def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") app.connect("doctree-read", resolve_aliases) - app.connect("missing-reference", ignore_internal_references) + app.connect("missing-reference", on_missing_reference) diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 21badd683af..3765b560a7f 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -29,4 +29,5 @@ other operations. user_guide/index cudf_pandas/index + libcudf_docs/index developer_guide/index diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst new file mode 100644 index 00000000000..49677acc730 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst @@ -0,0 +1,5 @@ +Aggregation Factories +===================== + +.. doxygengroup:: aggregation_factories + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst new file mode 100644 index 00000000000..5af85e5d74a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst @@ -0,0 +1,5 @@ +Aggregation Groupby +=================== + +.. doxygengroup:: aggregation_groupby + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst new file mode 100644 index 00000000000..e0f5046fe61 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst @@ -0,0 +1,5 @@ +Aggregation Reduction +===================== + +.. doxygengroup:: aggregation_reduction + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_rolling.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_rolling.rst new file mode 100644 index 00000000000..962f332adc3 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_rolling.rst @@ -0,0 +1,5 @@ +Aggregation Rolling +=================== + +.. doxygengroup:: aggregation_rolling + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_aggregation.rst b/docs/cudf/source/libcudf_docs/api_docs/column_aggregation.rst new file mode 100644 index 00000000000..39c0fdd5d3c --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_aggregation.rst @@ -0,0 +1,14 @@ +Column Aggregation +================== + +.. doxygengroup:: column_aggregation + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + aggregation_factories + aggregation_reduction + aggregation_groupby + aggregation_rolling diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/column_apis.rst new file mode 100644 index 00000000000..23660576a37 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_apis.rst @@ -0,0 +1,23 @@ +Column APIs +=========== + +.. doxygengroup:: column_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + column_copy + column_nullmask + column_sort + column_search + column_hash + column_merge + column_join + column_quantiles + column_aggregation + column_transformation + column_reshape + column_reorder + column_interop diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/column_classes.rst new file mode 100644 index 00000000000..cc3ed8c6626 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_classes.rst @@ -0,0 +1,16 @@ +Column Classes +============== + +.. doxygengroup:: column_classes + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + column_factories + dictionary_classes + lists_classes + strings_classes + structs_classes + timestamp_classes diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_copy.rst b/docs/cudf/source/libcudf_docs/api_docs/column_copy.rst new file mode 100644 index 00000000000..a8bc72fc505 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_copy.rst @@ -0,0 +1,16 @@ +Column Copy +=========== + +.. doxygengroup:: column_copy + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + copy_concatenate + copy_gather + copy_scatter + copy_slice + copy_split + copy_shift diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_factories.rst b/docs/cudf/source/libcudf_docs/api_docs/column_factories.rst new file mode 100644 index 00000000000..938db2a40a5 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_factories.rst @@ -0,0 +1,5 @@ +Column Factories +================ + +.. doxygengroup:: column_factories + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_hash.rst b/docs/cudf/source/libcudf_docs/api_docs/column_hash.rst new file mode 100644 index 00000000000..cd0c2838474 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_hash.rst @@ -0,0 +1,5 @@ +Column Hash +=========== + +.. doxygengroup:: column_hash + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_interop.rst b/docs/cudf/source/libcudf_docs/api_docs/column_interop.rst new file mode 100644 index 00000000000..dd6036b0339 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_interop.rst @@ -0,0 +1,12 @@ +Column Interop +============== + +.. doxygengroup:: column_interop + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + interop_dlpack + interop_arrow diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_join.rst b/docs/cudf/source/libcudf_docs/api_docs/column_join.rst new file mode 100644 index 00000000000..903319f4881 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_join.rst @@ -0,0 +1,5 @@ +Column Join +=========== + +.. 
doxygengroup:: column_join + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_merge.rst b/docs/cudf/source/libcudf_docs/api_docs/column_merge.rst new file mode 100644 index 00000000000..0f12ad3d169 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_merge.rst @@ -0,0 +1,5 @@ +Column Merge +============ + +.. doxygengroup:: column_merge + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_nullmask.rst b/docs/cudf/source/libcudf_docs/api_docs/column_nullmask.rst new file mode 100644 index 00000000000..35c94e0b03e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_nullmask.rst @@ -0,0 +1,5 @@ +Column Nullmask +=============== + +.. doxygengroup:: column_nullmask + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_quantiles.rst b/docs/cudf/source/libcudf_docs/api_docs/column_quantiles.rst new file mode 100644 index 00000000000..f1e1b3f843e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_quantiles.rst @@ -0,0 +1,5 @@ +Column Quantiles +================ + +.. doxygengroup:: column_quantiles + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_reorder.rst b/docs/cudf/source/libcudf_docs/api_docs/column_reorder.rst new file mode 100644 index 00000000000..ccc8a48f7df --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_reorder.rst @@ -0,0 +1,12 @@ +Column Reorder +============== + +.. doxygengroup:: column_reorder + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + reorder_partition + reorder_compact diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_reshape.rst b/docs/cudf/source/libcudf_docs/api_docs/column_reshape.rst new file mode 100644 index 00000000000..59df1eaa5b9 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_reshape.rst @@ -0,0 +1,11 @@ +Column Reshape +============== + +.. doxygengroup:: column_reshape + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + reshape_transpose diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_search.rst b/docs/cudf/source/libcudf_docs/api_docs/column_search.rst new file mode 100644 index 00000000000..1997c5618e3 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_search.rst @@ -0,0 +1,5 @@ +Column Search +============= + +.. doxygengroup:: column_search + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_sort.rst b/docs/cudf/source/libcudf_docs/api_docs/column_sort.rst new file mode 100644 index 00000000000..4d8fe895109 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_sort.rst @@ -0,0 +1,5 @@ +Column Sort +=========== + +.. doxygengroup:: column_sort + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_transformation.rst b/docs/cudf/source/libcudf_docs/api_docs/column_transformation.rst new file mode 100644 index 00000000000..e2da7e5ad28 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_transformation.rst @@ -0,0 +1,15 @@ +Column Transformation +===================== + +.. doxygengroup:: column_transformation + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + transformation_unaryops + transformation_binaryops + transformation_transform + transformation_replace + transformation_fill diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_concatenate.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_concatenate.rst new file mode 100644 index 00000000000..f9400bff9e8 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_concatenate.rst @@ -0,0 +1,5 @@ +Copy Concatenate +================ + +.. doxygengroup:: copy_concatenate + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_gather.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_gather.rst new file mode 100644 index 00000000000..daf306caa6e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_gather.rst @@ -0,0 +1,5 @@ +Copy Gather +=========== + +.. doxygengroup:: copy_gather + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_scatter.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_scatter.rst new file mode 100644 index 00000000000..d7b9a461901 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_scatter.rst @@ -0,0 +1,5 @@ +Copy Scatter +============ + +.. doxygengroup:: copy_scatter + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_shift.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_shift.rst new file mode 100644 index 00000000000..5187100a8a6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_shift.rst @@ -0,0 +1,5 @@ +Copy Shift +========== + +.. doxygengroup:: copy_shift + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_slice.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_slice.rst new file mode 100644 index 00000000000..3fb2fbe49d0 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_slice.rst @@ -0,0 +1,5 @@ +Copy Slice +========== + +.. doxygengroup:: copy_slice + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_split.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_split.rst new file mode 100644 index 00000000000..ddcdfd31cc4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_split.rst @@ -0,0 +1,5 @@ +Copy Split +========== + +.. doxygengroup:: copy_split + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/cudf_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/cudf_classes.rst new file mode 100644 index 00000000000..5473dd56ebb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/cudf_classes.rst @@ -0,0 +1,14 @@ +Cudf Classes +============ + +.. doxygengroup:: cudf_classes + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + column_classes + table_classes + scalar_classes + fixed_point_classes diff --git a/docs/cudf/source/libcudf_docs/api_docs/cudf_namespace.rst b/docs/cudf/source/libcudf_docs/api_docs/cudf_namespace.rst new file mode 100644 index 00000000000..fd4f3c9b6ab --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/cudf_namespace.rst @@ -0,0 +1,28 @@ +libcudf +======= + +.. TODO: This page really only exists right now for the purpose of resolving namespace links. We may want to just ignore these instead + +.. doxygennamespace:: cudf + :desc-only: + +.. doxygennamespace:: cudf::ast + :desc-only: + +.. doxygennamespace:: cudf::io + :desc-only: + +.. doxygennamespace:: cudf::io::orc + :desc-only: + +.. doxygennamespace:: cudf::io::parquet + :desc-only: + +.. doxygennamespace:: cudf::hashing + :desc-only: + +.. doxygennamespace:: numeric + :desc-only: + +.. 
doxygennamespace:: cudf::tdigest + :desc-only: diff --git a/docs/cudf/source/libcudf_docs/api_docs/datetime_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/datetime_apis.rst new file mode 100644 index 00000000000..8f7960444fc --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/datetime_apis.rst @@ -0,0 +1,12 @@ +Datetime APIs +============= + +.. doxygengroup:: datetime_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + datetime_extract + datetime_compute diff --git a/docs/cudf/source/libcudf_docs/api_docs/datetime_compute.rst b/docs/cudf/source/libcudf_docs/api_docs/datetime_compute.rst new file mode 100644 index 00000000000..0c7ba0f522f --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/datetime_compute.rst @@ -0,0 +1,5 @@ +Datetime Compute +================ + +.. doxygengroup:: datetime_compute + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/datetime_extract.rst b/docs/cudf/source/libcudf_docs/api_docs/datetime_extract.rst new file mode 100644 index 00000000000..da212480abc --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/datetime_extract.rst @@ -0,0 +1,5 @@ +Datetime Extract +================ + +.. doxygengroup:: datetime_extract + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/default_stream.rst b/docs/cudf/source/libcudf_docs/api_docs/default_stream.rst new file mode 100644 index 00000000000..c50493a8f60 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/default_stream.rst @@ -0,0 +1,5 @@ +Default Stream +============== + +.. doxygengroup:: default_stream + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_apis.rst new file mode 100644 index 00000000000..8d463b0a956 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_apis.rst @@ -0,0 +1,13 @@ +Dictionary APIs +=============== + +.. doxygengroup:: dictionary_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + dictionary_encode + dictionary_search + dictionary_update diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_classes.rst new file mode 100644 index 00000000000..00dec78c5f5 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_classes.rst @@ -0,0 +1,5 @@ +Dictionary Classes +================== + +.. doxygengroup:: dictionary_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_encode.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_encode.rst new file mode 100644 index 00000000000..ed77380f281 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_encode.rst @@ -0,0 +1,5 @@ +Dictionary Encode +================= + +.. doxygengroup:: dictionary_encode + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_search.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_search.rst new file mode 100644 index 00000000000..b187d29ca0b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_search.rst @@ -0,0 +1,5 @@ +Dictionary Search +================= + +.. doxygengroup:: dictionary_search + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_update.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_update.rst new file mode 100644 index 00000000000..8b0c12c09d9 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_update.rst @@ -0,0 +1,5 @@ +Dictionary Update +================= + +.. 
doxygengroup:: dictionary_update + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/expressions.rst b/docs/cudf/source/libcudf_docs/api_docs/expressions.rst new file mode 100644 index 00000000000..c65d8a29858 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/expressions.rst @@ -0,0 +1,5 @@ +Expression Evaluation +===================== + +.. doxygengroup:: expressions + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/fixed_point_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/fixed_point_classes.rst new file mode 100644 index 00000000000..0a1ef1b4d63 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/fixed_point_classes.rst @@ -0,0 +1,6 @@ + +Fixed Point Classes +=================== + +.. doxygengroup:: fixed_point_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/index.rst b/docs/cudf/source/libcudf_docs/api_docs/index.rst new file mode 100644 index 00000000000..c077a7cd452 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/index.rst @@ -0,0 +1,29 @@ +libcudf documentation +===================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + cudf_namespace + default_stream + cudf_classes + column_apis + datetime_apis + strings_apis + dictionary_apis + io_apis + json_apis + lists_apis + nvtext_apis + utility_apis + labeling_apis + expressions + tdigest + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/cudf/source/libcudf_docs/api_docs/interop_arrow.rst b/docs/cudf/source/libcudf_docs/api_docs/interop_arrow.rst new file mode 100644 index 00000000000..e0f0edfc9ae --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/interop_arrow.rst @@ -0,0 +1,5 @@ +Interop Arrow +============= + +.. doxygengroup:: interop_arrow + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/interop_dlpack.rst b/docs/cudf/source/libcudf_docs/api_docs/interop_dlpack.rst new file mode 100644 index 00000000000..4be168c5132 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/interop_dlpack.rst @@ -0,0 +1,5 @@ +Interop Dlpack +============== + +.. doxygengroup:: interop_dlpack + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/io_apis.rst new file mode 100644 index 00000000000..a23c0948584 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_apis.rst @@ -0,0 +1,15 @@ +Io APIs +======= + +.. doxygengroup:: io_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + io_types + io_readers + io_writers + io_datasources + io_datasinks diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_datasinks.rst b/docs/cudf/source/libcudf_docs/api_docs/io_datasinks.rst new file mode 100644 index 00000000000..15b0da5f7a7 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_datasinks.rst @@ -0,0 +1,5 @@ +Io Datasinks +============ + +.. doxygengroup:: io_datasinks + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_datasources.rst b/docs/cudf/source/libcudf_docs/api_docs/io_datasources.rst new file mode 100644 index 00000000000..3d5834892eb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_datasources.rst @@ -0,0 +1,5 @@ +Io Datasources +============== + +.. 
doxygengroup:: io_datasources + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst new file mode 100644 index 00000000000..a835673dee4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst @@ -0,0 +1,5 @@ +Io Readers +========== + +.. doxygengroup:: io_readers + :desc-only: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_types.rst b/docs/cudf/source/libcudf_docs/api_docs/io_types.rst new file mode 100644 index 00000000000..abdc43bc6fa --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_types.rst @@ -0,0 +1,5 @@ +Io Types +======== + +.. doxygengroup:: io_types + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_writers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_writers.rst new file mode 100644 index 00000000000..add57ecfed4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_writers.rst @@ -0,0 +1,5 @@ +Io Writers +========== + +.. doxygengroup:: io_writers + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/json_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/json_apis.rst new file mode 100644 index 00000000000..119dbc36fa1 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/json_apis.rst @@ -0,0 +1,11 @@ +JSON APIs +========= + +.. doxygengroup:: json_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + json_object diff --git a/docs/cudf/source/libcudf_docs/api_docs/json_object.rst b/docs/cudf/source/libcudf_docs/api_docs/json_object.rst new file mode 100644 index 00000000000..510a2f9eb07 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/json_object.rst @@ -0,0 +1,5 @@ +JSON Object +=========== + +.. doxygengroup:: json_object + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/label_bins.rst b/docs/cudf/source/libcudf_docs/api_docs/label_bins.rst new file mode 100644 index 00000000000..ca80912e527 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/label_bins.rst @@ -0,0 +1,5 @@ +Label Bins +========== + +.. doxygengroup:: label_bins + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/labeling_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/labeling_apis.rst new file mode 100644 index 00000000000..24726ce2e09 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/labeling_apis.rst @@ -0,0 +1,11 @@ +Labeling APIs +============= + +.. doxygengroup:: labeling_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + label_bins diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_apis.rst new file mode 100644 index 00000000000..d34253d909a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_apis.rst @@ -0,0 +1,20 @@ +Lists APIs +========== + +.. doxygengroup:: lists_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + lists_combine + lists_modify + lists_extract + lists_filling + lists_contains + lists_gather + lists_elements + lists_filtering + lists_sort + set_operations diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_classes.rst new file mode 100644 index 00000000000..9b89c164746 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_classes.rst @@ -0,0 +1,5 @@ +Lists Classes +============= + +.. 
doxygengroup:: lists_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_combine.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_combine.rst new file mode 100644 index 00000000000..e26fd89a3c2 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_combine.rst @@ -0,0 +1,5 @@ +Lists Combine +============= + +.. doxygengroup:: lists_combine + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_contains.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_contains.rst new file mode 100644 index 00000000000..ccb1366a6fb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_contains.rst @@ -0,0 +1,5 @@ +Lists Contains +============== + +.. doxygengroup:: lists_contains + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_elements.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_elements.rst new file mode 100644 index 00000000000..fb7758a46d0 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_elements.rst @@ -0,0 +1,5 @@ +Lists Elements +============== + +.. doxygengroup:: lists_elements + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_extract.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_extract.rst new file mode 100644 index 00000000000..f721d89684e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_extract.rst @@ -0,0 +1,5 @@ +Lists Extract +============= + +.. doxygengroup:: lists_extract + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_filling.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_filling.rst new file mode 100644 index 00000000000..5ab3c491dbe --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_filling.rst @@ -0,0 +1,5 @@ +Lists Filling +============= + +.. doxygengroup:: lists_filling + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_filtering.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_filtering.rst new file mode 100644 index 00000000000..af9f1ebaa2b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_filtering.rst @@ -0,0 +1,5 @@ +Lists Filtering +=============== + +.. doxygengroup:: lists_filtering + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_gather.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_gather.rst new file mode 100644 index 00000000000..4cb927b2672 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_gather.rst @@ -0,0 +1,5 @@ +Lists Gather +============ + +.. doxygengroup:: lists_gather + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_modify.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_modify.rst new file mode 100644 index 00000000000..d41020b75cd --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_modify.rst @@ -0,0 +1,5 @@ +Lists Modify +============ + +.. doxygengroup:: lists_modify + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_sort.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_sort.rst new file mode 100644 index 00000000000..63de08d1622 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_sort.rst @@ -0,0 +1,5 @@ +Lists Sort +========== + +.. doxygengroup:: lists_sort + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_apis.rst new file mode 100644 index 00000000000..f938f2914ed --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_apis.rst @@ -0,0 +1,18 @@ +Nvtext APIs +=========== + +.. doxygengroup:: nvtext_apis + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + nvtext_ngrams + nvtext_normalize + nvtext_stemmer + nvtext_edit_distance + nvtext_tokenize + nvtext_replace + nvtext_minhash + nvtext_jaccard diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_edit_distance.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_edit_distance.rst new file mode 100644 index 00000000000..e5eb2dc8c95 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_edit_distance.rst @@ -0,0 +1,5 @@ +Nvtext Edit Distance +==================== + +.. doxygengroup:: nvtext_edit_distance + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_jaccard.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_jaccard.rst new file mode 100644 index 00000000000..75124c5655a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_jaccard.rst @@ -0,0 +1,5 @@ +Nvtext Jaccard +============== + +.. doxygengroup:: nvtext_jaccard + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_minhash.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_minhash.rst new file mode 100644 index 00000000000..57d8445a3eb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_minhash.rst @@ -0,0 +1,5 @@ +Nvtext Minhash +============== + +.. doxygengroup:: nvtext_minhash + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_ngrams.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_ngrams.rst new file mode 100644 index 00000000000..27f93211f4c --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_ngrams.rst @@ -0,0 +1,5 @@ +Nvtext Ngrams +============= + +.. doxygengroup:: nvtext_ngrams + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_normalize.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_normalize.rst new file mode 100644 index 00000000000..7654f3c19f6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_normalize.rst @@ -0,0 +1,5 @@ +Nvtext Normalize +================ + +.. doxygengroup:: nvtext_normalize + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_replace.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_replace.rst new file mode 100644 index 00000000000..cdd2e45f48b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_replace.rst @@ -0,0 +1,5 @@ +Nvtext Replace +============== + +.. doxygengroup:: nvtext_replace + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_stemmer.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_stemmer.rst new file mode 100644 index 00000000000..fbe5675fe2d --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_stemmer.rst @@ -0,0 +1,5 @@ +Nvtext Stemmer +============== + +.. doxygengroup:: nvtext_stemmer + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_tokenize.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_tokenize.rst new file mode 100644 index 00000000000..58fc422dc88 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_tokenize.rst @@ -0,0 +1,5 @@ +Nvtext Tokenize +=============== + +.. doxygengroup:: nvtext_tokenize + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/reorder_compact.rst b/docs/cudf/source/libcudf_docs/api_docs/reorder_compact.rst new file mode 100644 index 00000000000..099d08f74ab --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/reorder_compact.rst @@ -0,0 +1,5 @@ +Reorder Compact +=============== + +.. 
doxygengroup:: reorder_compact + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/reorder_partition.rst b/docs/cudf/source/libcudf_docs/api_docs/reorder_partition.rst new file mode 100644 index 00000000000..a8b88ab244f --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/reorder_partition.rst @@ -0,0 +1,5 @@ +Reorder Partition +================= + +.. doxygengroup:: reorder_partition + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/reshape_transpose.rst b/docs/cudf/source/libcudf_docs/api_docs/reshape_transpose.rst new file mode 100644 index 00000000000..1ab29129c33 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/reshape_transpose.rst @@ -0,0 +1,5 @@ +Reshape Transpose +================= + +.. doxygengroup:: reshape_transpose + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/scalar_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/scalar_classes.rst new file mode 100644 index 00000000000..3022e5d2292 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/scalar_classes.rst @@ -0,0 +1,11 @@ +Scalar Classes +============== + +.. doxygengroup:: scalar_classes + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + scalar_factories diff --git a/docs/cudf/source/libcudf_docs/api_docs/scalar_factories.rst b/docs/cudf/source/libcudf_docs/api_docs/scalar_factories.rst new file mode 100644 index 00000000000..782ce6cb421 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/scalar_factories.rst @@ -0,0 +1,5 @@ +Scalar Factories +================ + +.. doxygengroup:: scalar_factories + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/set_operations.rst b/docs/cudf/source/libcudf_docs/api_docs/set_operations.rst new file mode 100644 index 00000000000..22118daec00 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/set_operations.rst @@ -0,0 +1,5 @@ +Set Operations +============== + +.. doxygengroup:: set_operations + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_apis.rst new file mode 100644 index 00000000000..c42d5479954 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_apis.rst @@ -0,0 +1,23 @@ +Strings APIs +============ + +.. doxygengroup:: strings_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + strings_case + strings_types + strings_combine + strings_contains + strings_convert + strings_copy + strings_slice + strings_find + strings_modify + strings_replace + strings_split + strings_extract + strings_regex diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_case.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_case.rst new file mode 100644 index 00000000000..ddec78a268e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_case.rst @@ -0,0 +1,5 @@ +Strings Case +============ + +.. doxygengroup:: strings_case + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_classes.rst new file mode 100644 index 00000000000..867b2ac242a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_classes.rst @@ -0,0 +1,5 @@ +Strings Classes +=============== + +.. 
doxygengroup:: strings_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_combine.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_combine.rst new file mode 100644 index 00000000000..4542308e0c7 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_combine.rst @@ -0,0 +1,5 @@ +Strings Combine +=============== + +.. doxygengroup:: strings_combine + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_contains.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_contains.rst new file mode 100644 index 00000000000..250eb96e541 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_contains.rst @@ -0,0 +1,5 @@ +Strings Contains +================ + +.. doxygengroup:: strings_contains + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst new file mode 100644 index 00000000000..ae5d78fb1a1 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst @@ -0,0 +1,5 @@ +Strings Convert +=============== + +.. doxygengroup:: strings_convert + :desc-only: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_copy.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_copy.rst new file mode 100644 index 00000000000..0c2884361bf --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_copy.rst @@ -0,0 +1,5 @@ +Strings Copy +============ + +.. doxygengroup:: strings_copy + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_extract.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_extract.rst new file mode 100644 index 00000000000..8018bbc627a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_extract.rst @@ -0,0 +1,5 @@ +Strings Extract +=============== + +.. doxygengroup:: strings_extract + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_find.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_find.rst new file mode 100644 index 00000000000..709bd138f54 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_find.rst @@ -0,0 +1,5 @@ +Strings Find +============ + +.. doxygengroup:: strings_find + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_modify.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_modify.rst new file mode 100644 index 00000000000..4bf54e5e835 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_modify.rst @@ -0,0 +1,5 @@ +Strings Modify +============== + +.. doxygengroup:: strings_modify + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_regex.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_regex.rst new file mode 100644 index 00000000000..719b09be24e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_regex.rst @@ -0,0 +1,5 @@ +Strings Regex +============= + +.. doxygengroup:: strings_regex + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_replace.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_replace.rst new file mode 100644 index 00000000000..884295e6d78 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_replace.rst @@ -0,0 +1,5 @@ +Strings Replace +=============== + +.. 
doxygengroup:: strings_replace + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_slice.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_slice.rst new file mode 100644 index 00000000000..bc1831a9dd4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_slice.rst @@ -0,0 +1,5 @@ +Strings Slice +============= + +.. doxygengroup:: strings_slice + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_split.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_split.rst new file mode 100644 index 00000000000..ddf7cf2f1d6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_split.rst @@ -0,0 +1,5 @@ +Strings Split +============= + +.. doxygengroup:: strings_split + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_types.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_types.rst new file mode 100644 index 00000000000..aff356efb71 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_types.rst @@ -0,0 +1,5 @@ +Strings Types +============= + +.. doxygengroup:: strings_types + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/structs_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/structs_classes.rst new file mode 100644 index 00000000000..2669c2884d6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/structs_classes.rst @@ -0,0 +1,5 @@ +Structs Classes +=============== + +.. doxygengroup:: structs_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/table_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/table_classes.rst new file mode 100644 index 00000000000..f00e315e597 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/table_classes.rst @@ -0,0 +1,5 @@ +Table Classes +============= + +.. doxygengroup:: table_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/tdigest.rst b/docs/cudf/source/libcudf_docs/api_docs/tdigest.rst new file mode 100644 index 00000000000..9eb97df8337 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/tdigest.rst @@ -0,0 +1,5 @@ +tdigest +======= + +.. doxygengroup:: tdigest + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/timestamp_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/timestamp_classes.rst new file mode 100644 index 00000000000..4651d8dbd32 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/timestamp_classes.rst @@ -0,0 +1,5 @@ +Timestamp Classes +================= + +.. doxygengroup:: timestamp_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_binaryops.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_binaryops.rst new file mode 100644 index 00000000000..59be3a7b45e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_binaryops.rst @@ -0,0 +1,5 @@ +Transformation Binaryops +======================== + +.. doxygengroup:: transformation_binaryops + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_fill.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_fill.rst new file mode 100644 index 00000000000..939fba261df --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_fill.rst @@ -0,0 +1,5 @@ +Transformation Fill +=================== + +.. 
doxygengroup:: transformation_fill + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_replace.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_replace.rst new file mode 100644 index 00000000000..6800b7bba76 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_replace.rst @@ -0,0 +1,5 @@ +Transformation Replace +====================== + +.. doxygengroup:: transformation_replace + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_transform.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_transform.rst new file mode 100644 index 00000000000..108a680f5a1 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_transform.rst @@ -0,0 +1,5 @@ +Transformation Transform +======================== + +.. doxygengroup:: transformation_transform + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_unaryops.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_unaryops.rst new file mode 100644 index 00000000000..07b5ba6388b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_unaryops.rst @@ -0,0 +1,5 @@ +Transformation Unaryops +======================= + +.. doxygengroup:: transformation_unaryops + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_apis.rst new file mode 100644 index 00000000000..9f0cda74582 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_apis.rst @@ -0,0 +1,15 @@ +Utility APIs +============ + +.. doxygengroup:: utility_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + utility_types + utility_dispatcher + utility_bitmask + utility_error + utility_span diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_bitmask.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_bitmask.rst new file mode 100644 index 00000000000..5dba1928baf --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_bitmask.rst @@ -0,0 +1,5 @@ +Utility Bitmask +=============== + +.. doxygengroup:: utility_bitmask + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_dispatcher.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_dispatcher.rst new file mode 100644 index 00000000000..539505e4551 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_dispatcher.rst @@ -0,0 +1,5 @@ +Utility Dispatcher +================== + +.. doxygengroup:: utility_dispatcher + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_error.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_error.rst new file mode 100644 index 00000000000..acb575636c9 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_error.rst @@ -0,0 +1,5 @@ +Utility Error +============= + +.. doxygengroup:: utility_error + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_span.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_span.rst new file mode 100644 index 00000000000..fdb1d254c0e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_span.rst @@ -0,0 +1,5 @@ +Utility Span +============ + +.. doxygengroup:: utility_span + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_types.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_types.rst new file mode 100644 index 00000000000..f021535d589 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_types.rst @@ -0,0 +1,5 @@ +Utility Types +============= + +.. 
doxygengroup:: utility_types + :members: diff --git a/docs/cudf/source/libcudf_docs/index.rst b/docs/cudf/source/libcudf_docs/index.rst new file mode 100644 index 00000000000..39ed46d8578 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/index.rst @@ -0,0 +1,17 @@ +libcudf documentation +===================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + api_docs/index.rst + md_regex + unicode_limitations + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/cudf/source/libcudf_docs/md_regex.rst b/docs/cudf/source/libcudf_docs/md_regex.rst new file mode 100644 index 00000000000..0eb0f464063 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/md_regex.rst @@ -0,0 +1,4 @@ +.. _md_regex: + +.. include:: ../../../../cpp/doxygen/regex.md + :parser: myst_parser.sphinx_ diff --git a/docs/cudf/source/libcudf_docs/unicode_limitations.rst b/docs/cudf/source/libcudf_docs/unicode_limitations.rst new file mode 100644 index 00000000000..1f069088160 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/unicode_limitations.rst @@ -0,0 +1,4 @@ +.. _unicode_limitations: + +.. include:: ../../../../cpp/doxygen/unicode.md + :parser: myst_parser.sphinx_ From 9acddc08cc209e8d6b94891be6131edd63ff5b43 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 17 Jan 2024 14:29:19 -0800 Subject: [PATCH 04/60] Reduce execution time of Parquet C++ tests (#14750) Reduced time from 90s to 25s on local system. Very few tests are impacted, and there should be no impact on code coverage. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14750 --- cpp/tests/io/parquet_misc_test.cpp | 5 +- cpp/tests/io/parquet_reader_test.cpp | 40 ++++++------ cpp/tests/io/parquet_v2_test.cpp | 94 +++++++--------------------- cpp/tests/io/parquet_writer_test.cpp | 42 ++++++------- 4 files changed, 64 insertions(+), 117 deletions(-) diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 49b6b8fd259..aa5a1cad96a 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -138,9 +138,8 @@ TEST_P(ParquetSizedTest, DictionaryTest) unsigned int const cardinality = (1 << (GetParam() - 1)) + 1; unsigned int const nrows = std::max(cardinality * 3 / 2, 3'000'000U); - auto elements = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { - return "a unique string value suffixed with " + std::to_string(i % cardinality); - }); + auto const elements = cudf::detail::make_counting_transform_iterator( + 0, [cardinality](auto i) { return std::to_string(i % cardinality); }); auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); auto const expected = table_view{{col0}}; diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 5cb05ac7011..abbd0c97f07 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -241,7 +241,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) { - constexpr int num_rows = 30 * 1000000; + constexpr int num_rows = 30 * 10000; std::mt19937 gen(6747); std::bernoulli_distribution bn(0.7f); @@ -251,21 +251,23 @@ TEST_F(ParquetReaderTest, 
UserBoundsWithNullsLarge) cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); - // this file will have row groups of 1,000,000 each + // this file will have row groups of 10,000 each cudf::table_view tbl({col}); auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsLarge.parquet"); cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) + .row_group_size_rows(10000) + .max_page_size_rows(1000); cudf::io::write_parquet(out_args); // skip_rows / num_rows // clang-format off - std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1613470, -1}, {1999999, -1}, + std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {16130, -1}, {19999, -1}, {31, 1}, {32, 1}, {33, 1}, // deliberately span some row group boundaries - {999000, 1001}, {999000, 2000}, {2999999, 2}, {13999997, -1}, - {16785678, 3}, {22996176, 31}, - {24001231, 17}, {29000001, 989999}, {29999999, 1} }; + {9900, 1001}, {9900, 2000}, {29999, 2}, {139997, -1}, + {167878, 3}, {229976, 31}, + {240031, 17}, {290001, 9899}, {299999, 1} }; // clang-format on for (auto p : params) { cudf::io::parquet_reader_options read_args = @@ -285,25 +287,27 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) TEST_F(ParquetReaderTest, ListUserBoundsWithNullsLarge) { - constexpr int num_rows = 5 * 1000000; + constexpr int num_rows = 5 * 10000; auto colp = make_parquet_list_list_col(0, num_rows, 5, 8, true); cudf::column_view col = *colp; - // this file will have row groups of 1,000,000 each + // this file will have row groups of 10,000 each cudf::table_view tbl({col}); auto filepath = temp_env->get_temp_filepath("ListUserBoundsWithNullsLarge.parquet"); cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) + .row_group_size_rows(10000) + .max_page_size_rows(1000); cudf::io::write_parquet(out_args); // skip_rows / num_rows // clang-format off - std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {161470, -1}, {4499997, -1}, + std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1670, -1}, {44997, -1}, {31, 1}, {32, 1}, {33, 1}, // deliberately span some row group boundaries - {999000, 1001}, {999000, 2000}, {2999999, 2}, - {1678567, 3}, {4299676, 31}, - {4001231, 17}, {1900000, 989999}, {4999999, 1} }; + {9900, 1001}, {9900, 2000}, {29999, 2}, + {16567, 3}, {42976, 31}, + {40231, 17}, {19000, 9899}, {49999, 1} }; // clang-format on for (auto p : params) { cudf::io::parquet_reader_options read_args = @@ -1951,7 +1955,7 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations) TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) { - constexpr int num_rows = 50'000; + constexpr int num_rows = 10'000; constexpr auto seed = 21337; std::mt19937 engine{seed}; @@ -2003,7 +2007,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) .compression(cudf::io::compression_type::NONE) .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .max_page_size_rows(20'000) + .max_page_size_rows(5'000) .write_v2_headers(true) .build(); cudf::io::write_parquet(out_opts); @@ -2018,7 +2022,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) // skip and truncate {1, 32}, {1, 33}, {32, 32}, {33, 139}, // cross page boundaries - {10'000, 20'000} + {3'000, 5'000} }; // 
clang-format on @@ -2044,7 +2048,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) .compression(cudf::io::compression_type::NONE) .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .max_page_size_rows(20'000) + .max_page_size_rows(5'000) .write_v2_headers(true); cudf::io::write_parquet(out_opts2); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index f2b50639a4d..1a373ed92ae 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -23,6 +23,8 @@ #include +using cudf::test::iterators::no_nulls; + // Base test fixture for V2 header tests class ParquetV2Test : public ::cudf::test::BaseFixtureWithParam {}; @@ -33,7 +35,7 @@ INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest, TEST_P(ParquetV2Test, MultiColumn) { - constexpr auto num_rows = 100000; + constexpr auto num_rows = 50000; auto const is_v2 = GetParam(); // auto col0_data = random_values(num_rows); @@ -45,27 +47,25 @@ TEST_P(ParquetV2Test, MultiColumn) auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); auto col8_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; }); - auto col8_data = cudf::detail::make_counting_transform_iterator(0, [col8_vals](auto i) { + auto col8_data = cudf::detail::make_counting_transform_iterator(0, [&col8_vals](auto i) { return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}}; }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - // column_wrapper col0{ - // col0_data.begin(), col0_data.end(), validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; - column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; - column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; - column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; - column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; - column_wrapper col6{col6_data, col6_data + num_rows, validity}; - column_wrapper col7{col7_data, col7_data + num_rows, validity}; - column_wrapper col8{col8_data, col8_data + num_rows, validity}; + // column_wrapper col0{col0_data.begin(), col0_data.end(), no_nulls()}; + column_wrapper col1{col1_data.begin(), col1_data.end(), no_nulls()}; + column_wrapper col2{col2_data.begin(), col2_data.end(), no_nulls()}; + column_wrapper col3{col3_data.begin(), col3_data.end(), no_nulls()}; + column_wrapper col4{col4_data.begin(), col4_data.end(), no_nulls()}; + column_wrapper col5{col5_data.begin(), col5_data.end(), no_nulls()}; + column_wrapper col6{col6_data, col6_data + num_rows, no_nulls()}; + column_wrapper col7{col7_data, col7_data + num_rows, no_nulls()}; + column_wrapper col8{col8_data, col8_data + num_rows, no_nulls()}; auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}}; @@ -108,17 +108,17 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); auto col7_vals = 
random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; }); // auto col0_mask = cudf::detail::make_counting_transform_iterator( // 0, [](auto i) { return (i % 2); }); auto col1_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); - auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + auto col2_mask = no_nulls(); auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = @@ -181,11 +181,10 @@ TEST_P(ParquetV2Test, Strings) auto seq_col0 = random_values(num_rows); auto seq_col2 = random_values(num_rows); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col0{seq_col0.begin(), seq_col0.end(), no_nulls()}; column_wrapper col1{strings.begin(), strings.end()}; - column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), no_nulls()}; auto expected = table_view{{col0, col1, col2}}; @@ -688,60 +687,9 @@ TEST_P(ParquetV2Test, PartitionedWriteEmptyColumns) CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); } -TEST_P(ParquetV2Test, LargeColumnIndex) -{ - // create a file large enough to be written in 2 batches (currently 1GB per batch) - // pick fragment size that num_rows is divisible by, so we'll get equal sized row groups - const std::string s1(1000, 'a'); - const std::string s2(1000, 'b'); - constexpr auto num_rows = 512 * 1024; - constexpr auto frag_size = num_rows / 128; - auto const is_v2 = GetParam(); - - auto col0_elements = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return (i < num_rows) ? 
s1 : s2; }); - auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + 2 * num_rows); - - auto const expected = table_view{{col0, col0}}; - - auto const filepath = temp_env->get_temp_filepath("LargeColumnIndex.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .write_v2_headers(is_v2) - .max_page_fragment_size(frag_size) - .row_group_size_bytes(1024 * 1024 * 1024) - .row_group_size_rows(num_rows); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (auto const& rg : fmd.row_groups) { - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); - } - } -} - TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; + constexpr auto num_rows = 50000; auto const is_v2 = GetParam(); auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 : cudf::io::parquet::detail::PageType::DATA_PAGE; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 51190b5de9e..9415e018c6a 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -28,6 +28,8 @@ #include +using cudf::test::iterators::no_nulls; + template void test_durations(mask_op_t mask_op) { @@ -100,13 +102,12 @@ TEST_F(ParquetWriterTest, MultiIndex) auto col2_data = random_values(num_rows); auto col3_data = random_values(num_rows); auto col4_data = random_values(num_rows); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col0{col0_data.begin(), col0_data.end(), validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; - column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; - column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; - column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; + column_wrapper col0{col0_data.begin(), col0_data.end(), no_nulls()}; + column_wrapper col1{col1_data.begin(), col1_data.end(), no_nulls()}; + column_wrapper col2{col2_data.begin(), col2_data.end(), no_nulls()}; + column_wrapper col3{col3_data.begin(), col3_data.end(), no_nulls()}; + column_wrapper col4{col4_data.begin(), col4_data.end(), no_nulls()}; auto expected = table_view{{col0, col1, col2, col3, col4}}; @@ -139,9 +140,7 @@ TEST_F(ParquetWriterTest, BufferSource) { constexpr auto num_rows = 100 << 10; auto const seq_col = random_values(num_rows); - auto const validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col{seq_col.begin(), seq_col.end(), validity}; + column_wrapper 
col{seq_col.begin(), seq_col.end(), no_nulls()}; auto const expected = table_view{{col}}; @@ -185,12 +184,13 @@ TEST_F(ParquetWriterTest, BufferSource) TEST_F(ParquetWriterTest, ManyFragments) { srand(31337); - auto const expected = create_random_fixed_table(10, 6'000'000, false); + auto const expected = create_random_fixed_table(1, 700'000, false); auto const filepath = temp_env->get_temp_filepath("ManyFragments.parquet"); cudf::io::parquet_writer_options const args = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected) - .max_page_size_bytes(8 * 1024); + .max_page_size_bytes(8 * 1024) + .max_page_fragment_size(10); cudf::io::write_parquet(args); cudf::io::parquet_reader_options const read_opts = @@ -342,11 +342,12 @@ TEST_F(ParquetWriterTest, DeviceWriteLargeishFile) // exercises multiple rowgroups srand(31337); - auto expected = create_random_fixed_table(4, 4 * 1024 * 1024, false); + auto expected = create_random_fixed_table(4, 1024 * 1024, false); // write out using the custom sink (which uses device writes) cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected) + .row_group_size_rows(128 * 1024); cudf::io::write_parquet(args); cudf::io::parquet_reader_options custom_args = @@ -613,11 +614,10 @@ TEST_F(ParquetWriterTest, EmptyListWithStruct) TEST_F(ParquetWriterTest, CheckPageRows) { auto sequence = thrust::make_counting_iterator(0); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); constexpr auto page_rows = 5000; constexpr auto num_rows = 2 * page_rows; - column_wrapper col(sequence, sequence + num_rows, validity); + column_wrapper col(sequence, sequence + num_rows, no_nulls()); auto expected = table_view{{col}}; @@ -1267,8 +1267,7 @@ TEST_F(ParquetWriterTest, CompStatsEmptyTable) TEST_F(ParquetWriterTest, NoNullsAsNonNullable) { - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col{{1, 2, 3}, valids}; + column_wrapper col{{1, 2, 3}, no_nulls()}; table_view expected({col}); cudf::io::table_input_metadata expected_metadata(expected); @@ -1459,10 +1458,9 @@ TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumn) { auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i % 400); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); constexpr auto num_rows = 800; - column_wrapper col(sequence, sequence + num_rows, validity); + column_wrapper col(sequence, sequence + num_rows, no_nulls()); auto expected = table_view{{col}}; @@ -1516,11 +1514,10 @@ TYPED_TEST(ParquetWriterTimestampTypeTest, Timestamps) { auto sequence = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return ((std::rand() / 10000) * 1000); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); constexpr auto num_rows = 100; column_wrapper col( - sequence, sequence + num_rows, validity); + sequence, sequence + num_rows, no_nulls()); auto expected = table_view{{col}}; @@ -1567,11 +1564,10 @@ TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampOverflow) { constexpr int64_t max = std::numeric_limits::max(); auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); - auto validity = cudf::detail::make_counting_transform_iterator(0, 
[](auto i) { return true; }); constexpr auto num_rows = 100; column_wrapper col( - sequence, sequence + num_rows, validity); + sequence, sequence + num_rows, no_nulls()); table_view expected({col}); auto filepath = temp_env->get_temp_filepath("ParquetTimestampOverflow.parquet"); From 734ca757bf43f76922f266ea3bb2cb67372374ca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 07:19:59 -1000 Subject: [PATCH 05/60] Implement `cudf.MultiIndex.from_arrays` (#14740) Implements `cudf.MultiIndex.from_arrays` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/14740 --- .../user_guide/api_docs/index_objects.rst | 1 + python/cudf/cudf/core/multiindex.py | 63 +++++++++++++++++++ python/cudf/cudf/tests/test_multiindex.py | 38 ++++++++--- 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst index 013eaf29a56..b6da9af9b3e 100644 --- a/docs/cudf/source/user_guide/api_docs/index_objects.rst +++ b/docs/cudf/source/user_guide/api_docs/index_objects.rst @@ -228,6 +228,7 @@ MultiIndex constructors .. autosummary:: :toctree: api/ + MultiIndex.from_arrays MultiIndex.from_tuples MultiIndex.from_product MultiIndex.from_frame diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0f323dd5540..8ba47795437 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -27,6 +27,7 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index +from cudf.utils.dtypes import is_column_like from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -1226,6 +1227,7 @@ def from_tuples(cls, tuples, names=None): See Also -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. MultiIndex.from_frame : Make a MultiIndex from a DataFrame. @@ -1335,6 +1337,7 @@ def from_frame(cls, df, names=None): See Also -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_tuples : Convert list of tuples to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. @@ -1429,6 +1432,66 @@ def from_product(cls, arrays, names=None): pdi = pd.MultiIndex.from_product(arrays, names=names) return cls.from_pandas(pdi) + @classmethod + @_cudf_nvtx_annotate + def from_arrays( + cls, + arrays, + sortorder=None, + names=None, + ) -> MultiIndex: + """ + Convert arrays to MultiIndex. + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : optional int + Not yet supported + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. 
+ + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> cudf.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + # Imported here due to circular import + from cudf.core.algorithms import factorize + + error_msg = "Input must be a list / sequence of array-likes." + if not is_list_like(arrays): + raise TypeError(error_msg) + codes = [] + levels = [] + for array in arrays: + if not (is_list_like(array) or is_column_like(array)): + raise TypeError(error_msg) + code, level = factorize(array, sort=True) + codes.append(code) + levels.append(level) + return cls( + codes=codes, levels=levels, sortorder=sortorder, names=names + ) + @_cudf_nvtx_annotate def _poplevels(self, level): """ diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 2d5a4d1d782..78bce89f2a8 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. """ Test related to MultiIndex @@ -2085,12 +2085,7 @@ def test_multiindex_eq_other_multiindex(): params=[ "from_product", "from_tuples", - pytest.param( - "from_arrays", - marks=pytest.mark.xfail( - reason="TODO: from_arrays is not implemented" - ), - ), + "from_arrays", "init", ] ) @@ -2100,7 +2095,7 @@ def midx(request): elif request.param == "from_tuples": return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)]) elif request.param == "from_arrays": - return cudf.MultiIndex.from_arrays([0, 0, 1, 1], [1, 0, 1, 0]) + return cudf.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]]) elif request.param == "init": return cudf.MultiIndex( levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]] @@ -2112,3 +2107,30 @@ def midx(request): def test_multindex_constructor_levels_always_indexes(midx): assert_eq(midx.levels[0], cudf.Index([0, 1])) assert_eq(midx.levels[1], cudf.Index([0, 1])) + + +@pytest.mark.parametrize( + "array", + [ + list, + tuple, + np.array, + cp.array, + pd.Index, + cudf.Index, + pd.Series, + cudf.Series, + ], +) +def test_multiindex_from_arrays(array): + pd_data = [[0, 0, 1, 1], [1, 0, 1, 0]] + cudf_data = [array(lst) for lst in pd_data] + result = pd.MultiIndex.from_arrays(pd_data) + expected = cudf.MultiIndex.from_arrays(cudf_data) + assert_eq(result, expected) + + +@pytest.mark.parametrize("arg", ["foo", ["foo"]]) +def test_multiindex_from_arrays_wrong_arg(arg): + with pytest.raises(TypeError): + cudf.MultiIndex.from_arrays(arg) From 70cdeec9c0726b464a1f01746cf278a7de8ffeb7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 07:22:08 -1000 Subject: [PATCH 06/60] Use _from_data instead of _from_columns for initialzing Frame (#14755) In the spirit of reducing redundant methods, `_from_columns` just calls `_from_data` (hoping to rename to `_from_mapping` or similar) so removing the need for `_from_columns`. Hoping to do the same for the `_from_columns_like_self` in a follow up. 
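For illustration only (not part of the diff below): a minimal sketch, assuming a working cudf install, of the `{name: column}` mapping form that `_from_data` consumes. The sample data, the names `"x"`/`"y"`, and the use of `Series._column` to obtain column objects are purely for this example.

```python
# Sketch of the dict-based construction pattern this change standardizes on.
# Former cls._from_columns(columns, column_names) call sites now build the
# mapping themselves and hand it to _from_data.
import cudf

columns = [cudf.Series([1, 2, 3])._column, cudf.Series(["a", "b", "c"])._column]
column_names = ["x", "y"]

data = dict(zip(column_names, columns))  # pair each name with its column
frame = cudf.DataFrame._from_data(data)  # internal constructor used by this PR
print(frame)
```

The mapping form keeps each name paired with its column in one object instead of two parallel sequences, which is what lets `_from_columns` go away.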
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/14755 --- python/cudf/cudf/core/_internals/timezones.py | 8 +-- python/cudf/cudf/core/frame.py | 14 +---- python/cudf/cudf/core/groupby/groupby.py | 22 +++++--- python/cudf/cudf/core/index.py | 12 ++--- python/cudf/cudf/core/indexed_frame.py | 51 +++++++------------ python/cudf/cudf/io/dlpack.py | 6 +-- 6 files changed, 49 insertions(+), 64 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 67043d3fbb3..053425fff8d 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. import os import zoneinfo @@ -89,8 +89,10 @@ def _read_tzfile_as_frame(tzdir, zone_name): [np.timedelta64(0, "s")] ) - return DataFrame._from_columns( - transition_times_and_offsets, ["transition_times", "offsets"] + return DataFrame._from_data( + dict( + zip(["transition_times", "offsets"], transition_times_and_offsets) + ) ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5f7a86e86d8..05104a3ef05 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -144,17 +144,6 @@ def _from_data(cls, data: MutableMapping): def _from_data_like_self(self, data: MutableMapping): return self._from_data(data) - @classmethod - @_cudf_nvtx_annotate - def _from_columns( - cls, - columns: List[ColumnBase], - column_names: abc.Iterable[str], - ): - """Construct a `Frame` object from a list of columns.""" - data = {name: columns[i] for i, name in enumerate(column_names)} - return cls._from_data(data) - @_cudf_nvtx_annotate def _from_columns_like_self( self, @@ -169,7 +158,8 @@ def _from_columns_like_self( """ if column_names is None: column_names = self._column_names - frame = self.__class__._from_columns(columns, column_names) + data = dict(zip(column_names, columns)) + frame = self.__class__._from_data(data) return frame._copy_type_metadata(self, override_dtypes=override_dtypes) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4e8947652ff..6c83bcd9efb 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2111,9 +2111,13 @@ def diff(self, periods=1, axis=0): def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: """Internal implementation for `ffill` and `bfill`""" values = self.grouping.values - result = self.obj._from_columns( - self._groupby.replace_nulls([*values._columns], method), - values._column_names, + result = self.obj._from_data( + dict( + zip( + values._column_names, + self._groupby.replace_nulls([*values._columns], method), + ) + ) ) result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) @@ -2305,9 +2309,15 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): else: fill_value = [fill_value] * len(values._data) - result = self.obj.__class__._from_columns( - self._groupby.shift([*values._columns], periods, fill_value)[0], - values._column_names, + result = self.obj.__class__._from_data( + dict( + zip( + values._column_names, + self._groupby.shift( + [*values._columns], periods, fill_value + )[0], + ) + ) ) result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) diff --git 
a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e012d8e7140..3e8f6bc2ccb 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -800,22 +800,22 @@ def sort_values( @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return _dtype_to_index[self.dtype.type]._from_columns( - [self._values.take(gather_map, nullify, check_bounds)], [self.name] + return _dtype_to_index[self.dtype.type]._from_data( + {self.name: self._values.take(gather_map, nullify, check_bounds)} ) @_cudf_nvtx_annotate def _apply_boolean_mask(self, boolean_mask): - return _dtype_to_index[self.dtype.type]._from_columns( - [self._values.apply_boolean_mask(boolean_mask)], [self.name] + return _dtype_to_index[self.dtype.type]._from_data( + {self.name: self._values.apply_boolean_mask(boolean_mask)} ) def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return _dtype_to_index[self.dtype.type]._from_columns( - [self._as_int_index()._split(splits)], [self.name] + return _dtype_to_index[self.dtype.type]._from_data( + {self.name: self._as_int_index()._split(splits)} ) def _binaryop(self, other, op: str): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2a35ac0f959..70be5c3ad0f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -291,19 +291,27 @@ def _from_data_like_self(self, data: MutableMapping): out._data._level_names = self._data._level_names return out - @classmethod @_cudf_nvtx_annotate - def _from_columns( - cls, + def _from_columns_like_self( + self, columns: List[ColumnBase], - column_names: List[str], + column_names: Optional[abc.Iterable[str]] = None, index_names: Optional[List[str]] = None, - ): - """Construct a `Frame` object from a list of columns. + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + ) -> Self: + """Construct a `Frame` from a list of columns with metadata from self. If `index_names` is set, the first `len(index_names)` columns are used to construct the index of the frame. + + If override_dtypes is provided then any non-None entry will be + used for the dtype of the matching column in preference to the + dtype of the column in self. """ + if column_names is None: + column_names = self._column_names + data_columns = columns index = None @@ -316,36 +324,11 @@ def _from_columns( else: index.name = index_names[0] - out = super()._from_columns(data_columns, column_names) + data = dict(zip(column_names, data_columns)) + frame = self.__class__._from_data(data) if index is not None: - out._index = index - - return out - - @_cudf_nvtx_annotate - def _from_columns_like_self( - self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, - *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, - ) -> Self: - """Construct a `Frame` from a list of columns with metadata from self. - - If `index_names` is set, the first `len(index_names)` columns are - used to construct the index of the frame. - - If override_dtypes is provided then any non-None entry will be - used for the dtype of the matching column in preference to the - dtype of the column in self. 
- """ - if column_names is None: - column_names = self._column_names - frame = self.__class__._from_columns( - columns, column_names, index_names - ) + frame._index = index return frame._copy_type_metadata( self, include_index=bool(index_names), diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index bed376e4a79..d3d99aab0cd 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -35,12 +35,12 @@ def from_dlpack(pycapsule_obj): """ columns = libdlpack.from_dlpack(pycapsule_obj) - column_names = range(len(columns)) + data = dict(enumerate(columns)) if len(columns) == 1: - return cudf.Series._from_columns(columns, column_names=column_names) + return cudf.Series._from_data(data) else: - return cudf.DataFrame._from_columns(columns, column_names=column_names) + return cudf.DataFrame._from_data(data) @ioutils.doc_to_dlpack() From 66c3e8e92f9c37dd909b78936addb463f1bd6011 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Thu, 18 Jan 2024 10:40:15 -0800 Subject: [PATCH 07/60] Resolve degenerate performance in `create_structs_data` (#14761) Resolves issue [#14716](https://github.com/rapidsai/cudf/issues/14716) - Eliminated unnecessary recursive self-calls in the `superimpose_nulls_no_sanitize` function, addressing performance issues in `make_structs_column`. - Introduced `STRUCT_CREATION_NVBENCH` to assess the performance of the `create_structs_data` function. Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14761 --- cpp/benchmarks/CMakeLists.txt | 7 ++++- cpp/benchmarks/structs/create_structs.cpp | 31 +++++++++++++++++++++++ cpp/src/structs/utilities.cpp | 14 +++------- 3 files changed, 41 insertions(+), 11 deletions(-) create mode 100644 cpp/benchmarks/structs/create_structs.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 9c3a05a2f5f..35b03fa33d0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -183,6 +183,11 @@ ConfigureNVBench( sort/sort_lists.cpp sort/sort_structs.cpp ) +# ################################################################################################## +# * structs benchmark +# -------------------------------------------------------------------------------- +ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) + # ################################################################################################## # * quantiles benchmark # -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/structs/create_structs.cpp b/cpp/benchmarks/structs/create_structs.cpp new file mode 100644 index 00000000000..480a719461e --- /dev/null +++ b/cpp/benchmarks/structs/create_structs.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +void nvbench_create_structs(nvbench::state& state) +{ + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const table_ptr = create_structs_data(state); }); +} + +NVBENCH_BENCH(nvbench_create_structs) + .set_name("create_structs") + .add_int64_power_of_two_axis("NumRows", {10, 18, 26}) + .add_int64_axis("Depth", {1, 8, 16}) + .add_int64_axis("Nulls", {0, 1}); diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index acb153f28d6..f47d066852c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -229,6 +230,7 @@ std::unique_ptr superimpose_nulls_no_sanitize(bitmask_type const* null_m rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); if (input->type().id() == cudf::type_id::EMPTY) { // EMPTY columns should not have a null mask, // so don't superimpose null mask on empty columns. @@ -258,19 +260,11 @@ std::unique_ptr superimpose_nulls_no_sanitize(bitmask_type const* null_m // If the input is also a struct, repeat for all its children. Otherwise just return. if (input->type().id() != cudf::type_id::STRUCT) { return std::move(input); } - auto const current_mask = input->view().null_mask(); auto const new_null_count = input->null_count(); // this was just computed in the step above auto content = input->release(); - // Build new children columns. - std::for_each(content.children.begin(), - content.children.end(), - [current_mask, new_null_count, stream, mr](auto& child) { - child = superimpose_nulls_no_sanitize( - current_mask, new_null_count, std::move(child), stream, mr); - }); - // Replace the children columns. 
+ // make_structs_column recursively calls superimpose_nulls return cudf::make_structs_column(num_rows, std::move(content.children), new_null_count, From 46119314c3a7a7621a476d730d0441031398f74c Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 18 Jan 2024 14:54:50 -0500 Subject: [PATCH 08/60] DOC v24.04 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 4 +-- .devcontainer/cuda11.8-pip/devcontainer.json | 4 +-- .../cuda12.0-conda/devcontainer.json | 4 +-- .devcontainer/cuda12.0-pip/devcontainer.json | 4 +-- .github/workflows/build.yaml | 16 ++++----- .github/workflows/pr.yaml | 36 +++++++++---------- .github/workflows/test.yaml | 20 +++++------ README.md | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- ci/check_style.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 10 +++--- .../all_cuda-120_arch-x86_64.yaml | 10 +++--- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 4 +-- cpp/examples/fetch_dependencies.cmake | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- dependencies.yaml | 16 ++++----- docs/cudf/source/conf.py | 4 +-- docs/dask_cudf/source/conf.py | 4 +-- fetch_rapids.cmake | 2 +- java/ci/README.md | 4 +-- java/pom.xml | 2 +- java/src/main/native/CMakeLists.txt | 2 +- python/cudf/CMakeLists.txt | 2 +- python/cudf/pyproject.toml | 4 +-- python/cudf_kafka/CMakeLists.txt | 2 +- python/cudf_kafka/pyproject.toml | 2 +- python/custreamz/pyproject.toml | 4 +-- python/dask_cudf/pyproject.toml | 6 ++-- 30 files changed, 90 insertions(+), 90 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 72a4b5e0f04..6e71505fc7e 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.02-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index d2afd3a497e..84616c25cf2 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json index 42ed334fe03..ef2b34b41a6 100644 --- a/.devcontainer/cuda12.0-conda/devcontainer.json +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, 
"features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json index 306a2065ef0..d3257b6cf43 100644 --- a/.devcontainer/cuda12.0-pip/devcontainer.json +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda12.0-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 36cca8b6161..c663f52f548 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} build-2_28-wheels: "true" @@ -80,7 +80,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -90,7 +90,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or 
.CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index edcc140b191..9c46955ce56 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -31,34 +31,34 @@ jobs: #- pandas-tests-diff #- pandas-tests-diff-comment secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -66,14 +66,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -83,7 +83,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -103,7 +103,7 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request build-2_28-wheels: "true" @@ -111,14 +111,14 @@ jobs: wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -126,14 +126,14 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 + uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 with: build_command: | sccache -z; @@ -142,7 +142,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -151,7 +151,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: pull-request @@ -161,7 +161,7 @@ jobs: # needs: [pandas-tests-main, pandas-tests-pr] # secrets: inherit # # This branch exports a `job_output` output that the downstream job reads. 
- # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 # with: # node_type: cpu4 # build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index df26a8c5916..65bb1af00cb 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly @@ -98,7 +98,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -108,7 +108,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: nightly diff --git a/README.md b/README.md index 996e5ff4800..0602647dad7 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.02 python=3.10 cuda-version=11.8 + cudf=24.04 python=3.10 cuda-version=11.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 3c6c5e2b706..4a2fe8aa570 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.02.00 +24.04.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index ceab29c2473..98b17bc0a64 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -27,7 +27,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf -export RAPIDS_VERSION_NUMBER="24.02" +export RAPIDS_VERSION_NUMBER="24.04" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/check_style.sh b/ci/check_style.sh index 088860d38df..da598a58880 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -16,7 +16,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 15bcf207b1b..b5e2566fd0d 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -28,7 +28,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.2.* +- dask-cuda==24.4.* - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -47,10 +47,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.2.* +- libkvikio==24.4.* - libparquet==14.0.1.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.2.* +- librmm==24.4.* - make - moto>=4.0.8 - msgpack-python @@ -82,9 +82,9 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - rich -- rmm==24.2.* +- rmm==24.4.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index ccad6a366fb..2aa64e6384b 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -29,7 +29,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.2.* +- dask-cuda==24.4.* - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -46,10 +46,10 @@ dependencies: - libarrow==14.0.1.* - libcufile-dev - libcurand-dev -- libkvikio==24.2.* +- libkvikio==24.4.* - libparquet==14.0.1.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.2.* +- librmm==24.4.* - make - moto>=4.0.8 - msgpack-python @@ -79,9 +79,9 @@ dependencies: - 
python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - rich -- rmm==24.2.* +- rmm==24.4.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2c0f601ca74..ca38284226a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 24.02.00 + VERSION 24.04.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 4333699d1f6..e45f856b870 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = libcudf # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 24.02.00 +PROJECT_NUMBER = 24.04.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2226,7 +2226,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/24.02 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/24.04 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake index 2ec7171c7da..34db0bcdb8c 100644 --- a/cpp/examples/fetch_dependencies.cmake +++ b/cpp/examples/fetch_dependencies.cmake @@ -19,7 +19,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-24.02) +set(CUDF_TAG branch-24.04) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index e31f6bd4096..5080091664e 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDF_KAFKA - VERSION 24.02.00 + VERSION 24.04.00 LANGUAGES CXX ) diff --git a/dependencies.yaml b/dependencies.yaml index 1c6d5086bf3..719794e6a19 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -247,8 +247,8 @@ dependencies: - &gbench benchmark==1.8.0 - &gtest gtest>=1.13.0 - &gmock gmock>=1.13.0 - - librmm==24.2.* - - libkvikio==24.2.* + - librmm==24.4.* + - libkvikio==24.4.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.5 @@ -281,7 +281,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.2.* + - &rmm_conda rmm==24.4.* - &protobuf protobuf>=4.21,<5 - pip - pip: @@ -458,7 +458,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.2.* + - dask-cuda==24.4.* - *doxygen - make - myst-nb @@ -557,7 +557,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.2.* + - rapids-dask-dependency==24.4.* run_custreamz: common: - output_types: conda @@ -656,13 +656,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.2.* + - dask-cuda==24.4.* - *numba depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.2.* + - &cudf_conda cudf==24.4.* - output_types:
requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -683,7 +683,7 @@ dependencies: common: - output_types: conda packages: - - &cudf_kafka_conda cudf_kafka==24.2.* + - &cudf_kafka_conda cudf_kafka==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index b997c78fba8..6bcc9f24e1c 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -163,9 +163,9 @@ def clean_all_xml_files(path): # built documents. # # The short X.Y version. -version = '24.02' +version = '24.04' # The full version, including alpha/beta/rc tags. -release = '24.02.00' +release = '24.04.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index daffe941963..f1f28ccd752 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -11,8 +11,8 @@ project = "dask-cudf" copyright = "2018-2023, NVIDIA Corporation" author = "NVIDIA Corporation" -version = '24.02' -release = '24.02.00' +version = '24.04' +release = '24.04.00' language = "en" diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 65b0b89042b..463caa5088b 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake ) endif() diff --git a/java/ci/README.md b/java/ci/README.md index f31c950c517..ba039acc45d 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.02 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.04 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable devtoolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-24.02.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-24.04.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index ebc5c968836..8b2fdcaa85f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 24.02.0-SNAPSHOT + 24.04.0-SNAPSHOT cudfjni diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 1853cbd6a5c..e42eff19895 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -28,7 +28,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 24.02.00 + VERSION 24.04.00 LANGUAGES C CXX CUDA ) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index b3e1f13ab85..77771afe0e6 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_version 24.02.00) +set(cudf_version 24.04.00) include(../../fetch_rapids.cmake) include(rapids-cuda) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 18771804f61..780b2b39add 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ "numpy>=1.21,<1.25", "protoc-wheel", "pyarrow==14.0.1.*", - "rmm==24.2.*", + "rmm==24.4.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -38,7 +38,7 @@ dependencies = [ "ptxcompiler", "pyarrow>=14.0.1,<15.0.0a0", "rich", - "rmm==24.2.*", + "rmm==24.4.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index 116bc7a3beb..db18d901ba6 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_kafka_version 24.02.00) +set(cudf_kafka_version 24.04.00) include(../../fetch_rapids.cmake) diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 062a0224c1f..24b27b40dbd 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -22,7 +22,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.2.*", + "cudf==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 3e6c74ab570..04396cab452 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -19,8 +19,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==24.2.*", - "cudf_kafka==24.2.*", + "cudf==24.4.*", + "cudf_kafka==24.4.*", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 33065da6e8d..6b279e5abd2 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -18,12 +18,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.2.*", + "cudf==24.4.*", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", - "rapids-dask-dependency==24.2.*", + "rapids-dask-dependency==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -41,7 +41,7 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.2.*", + "dask-cuda==24.4.*", "numba>=0.57", "pytest", "pytest-cov", From c0a9510280dffbc563e2960b9b844112d09ef15d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 18 Jan 2024 18:34:22 -0500 Subject: [PATCH 09/60] Remove unparseable attributes from all nodes (#14780) In my initial pass through enabling Breathe I tried to leave a minimal footprint of external modification to the generated files. In this particular case it looks like the problematic attributes can appear in more places than I originally observed. I never observed this behavior in that PR, but I don't know if these now appear due to something that merged after #13846 or something else changing in the environment (e.g. a Sphinx or doxygen behavior etc). Nonetheless, blanket wiping these is the simpler and safer option. The docs build successfully locally with this change. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14780 --- docs/cudf/source/conf.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index b997c78fba8..a0bb555365e 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -109,12 +109,12 @@ def clean_definitions(root): # All of these in type declarations cause Breathe to choke. # For friend, see https://github.com/breathe-doc/breathe/issues/916 strings_to_remove = ("__forceinline__", "CUDF_HOST_DEVICE", "decltype(auto)", "friend") - for field in (".//type", ".//definition"): - for type_ in root.findall(field): - if type_.text is not None: - for string in strings_to_remove: - type_.text = type_.text.replace(string, "") - + for node in root.iter(): + for string in strings_to_remove: + if node.text is not None: + node.text = node.text.replace(string, "") + if node.tail is not None: + node.tail = node.tail.replace(string, "") def clean_all_xml_files(path): for fn in glob.glob(os.path.join(path, "*.xml")): From eeee795c232e2811adeb5a3942f7a149d8b16d49 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 18 Jan 2024 15:59:40 -0800 Subject: [PATCH 10/60] fix benchmarks compatibility with newer pytest-cases (#14764) Reverts changes from #14756. 
* updates `cudf`'s tests to be compatible with the latest `pytest-cases` ([version 3.8.2](https://pypi.org/project/pytest-cases/#history)) * puts a floor of `pytest-cases>=3.8.2` on that project to be sure older versions aren't used Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14764 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/benchmarks/API/bench_dataframe.py | 6 ++++-- python/cudf/benchmarks/API/bench_functions.py | 6 ++++-- .../API/{bench_dataframe_cases.py => cases_dataframe.py} | 2 +- .../API/{bench_functions_cases.py => cases_functions.py} | 2 +- python/cudf/pyproject.toml | 2 +- 8 files changed, 14 insertions(+), 10 deletions(-) rename python/cudf/benchmarks/API/{bench_dataframe_cases.py => cases_dataframe.py} (88%) rename python/cudf/benchmarks/API/{bench_functions_cases.py => cases_functions.py} (99%) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 15bcf207b1b..47b377013ce 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -75,7 +75,7 @@ dependencies: - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark -- pytest-cases<3.8.2 +- pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index ccad6a366fb..de59bc1d43c 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -72,7 +72,7 @@ dependencies: - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark -- pytest-cases<3.8.2 +- pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/dependencies.yaml b/dependencies.yaml index 1c6d5086bf3..9cf808907ec 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -620,7 +620,7 @@ dependencies: - fastavro>=0.22.9 - hypothesis - pytest-benchmark - - pytest-cases<3.8.2 + - pytest-cases>=3.8.2 - python-snappy>=0.6.0 - scipy - output_types: conda diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index f908a995c2a..59d73015962 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of DataFrame methods.""" @@ -178,6 +178,8 @@ def bench_nsmallest(benchmark, dataframe, num_cols_to_sort, n): benchmark(dataframe.nsmallest, n, by) -@pytest_cases.parametrize_with_cases("dataframe, cond, other", prefix="where") +@pytest_cases.parametrize_with_cases( + "dataframe, cond, other", prefix="where", cases="cases_dataframe" +) def bench_where(benchmark, dataframe, cond, other): benchmark(dataframe.where, cond, other) diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index ec4be221d9f..93109838900 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
"""Benchmarks of free functions that accept cudf objects.""" @@ -9,7 +9,9 @@ from utils import benchmark_with_object -@pytest_cases.parametrize_with_cases("objs", prefix="concat") +@pytest_cases.parametrize_with_cases( + "objs", prefix="concat", cases="cases_functions" +) @pytest.mark.parametrize( "axis", [ diff --git a/python/cudf/benchmarks/API/bench_dataframe_cases.py b/python/cudf/benchmarks/API/cases_dataframe.py similarity index 88% rename from python/cudf/benchmarks/API/bench_dataframe_cases.py rename to python/cudf/benchmarks/API/cases_dataframe.py index fc41d141c8a..d12b9776f1b 100644 --- a/python/cudf/benchmarks/API/bench_dataframe_cases.py +++ b/python/cudf/benchmarks/API/cases_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from utils import benchmark_with_object diff --git a/python/cudf/benchmarks/API/bench_functions_cases.py b/python/cudf/benchmarks/API/cases_functions.py similarity index 99% rename from python/cudf/benchmarks/API/bench_functions_cases.py rename to python/cudf/benchmarks/API/cases_functions.py index c81f8f20f80..6bc66aa4a9b 100644 --- a/python/cudf/benchmarks/API/bench_functions_cases.py +++ b/python/cudf/benchmarks/API/cases_functions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Test cases for benchmarks in bench_functions.py.""" diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 18771804f61..c7b66abea27 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -59,7 +59,7 @@ test = [ "msgpack", "pytest", "pytest-benchmark", - "pytest-cases<3.8.2", + "pytest-cases>=3.8.2", "pytest-cov", "pytest-xdist", "python-snappy>=0.6.0", From 2c1b94970959a98780a603f18c560e79f558094d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 18:25:28 -1000 Subject: [PATCH 11/60] Clean up `DatetimeIndex.__init__` constructor (#14774) Additionally adds some typing and remove validation done by `cudf.dtype` and add a unit test to ensure numpy dtype objects are accepted in the constructor Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14774 --- python/cudf/cudf/core/index.py | 22 ++++++++++------------ python/cudf/cudf/tests/test_datetime.py | 8 ++++++++ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3e8f6bc2ccb..96643ef08d3 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2118,13 +2118,13 @@ def __init__( data=None, freq=None, tz=None, - normalize=False, + normalize: bool = False, closed=None, - ambiguous="raise", - dayfirst=False, - yearfirst=False, + ambiguous: Literal["raise"] = "raise", + dayfirst: bool = False, + yearfirst: bool = False, dtype=None, - copy=False, + copy: bool = False, name=None, ): # we should be more strict on what we accept here but @@ -2147,22 +2147,20 @@ def __init__( self._freq = _validate_freq(freq) - valid_dtypes = tuple( - f"datetime64[{res}]" for res in ("s", "ms", "us", "ns") - ) if dtype is None: # nanosecond default matches pandas dtype = "datetime64[ns]" - elif dtype not in valid_dtypes: - raise TypeError("Invalid dtype") + dtype = cudf.dtype(dtype) + if dtype.kind != "M": + raise TypeError("dtype must be a datetime type") - 
kwargs = _setdefault_name(data, name=name) + name = _setdefault_name(data, name=name)["name"] data = column.as_column(data, dtype=dtype) if copy: data = data.copy() - super().__init__(data, **kwargs) + super().__init__(data, name=name) if self._freq is not None: unique_vals = self.to_series().diff().unique() diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 22d452fdda5..2ea2885bc7b 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2429,3 +2429,11 @@ def test_dateimeindex_from_noniso_string(): def test_to_datetime_errors_non_scalar_not_implemented(errors): with pytest.raises(NotImplementedError): cudf.to_datetime([1, ""], unit="s", errors=errors) + + +def test_datetimeindex_dtype_np_dtype(): + dtype = np.dtype("datetime64[ns]") + data = [1] + gdti = cudf.DatetimeIndex(data, dtype=dtype) + pdti = pd.DatetimeIndex(data, dtype=dtype) + assert_eq(gdti, pdti) From f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 19:22:50 -1000 Subject: [PATCH 12/60] Use instance over is_foo_dtype (#14641) Similar to https://github.com/rapidsai/cudf/pull/14638, use isinstance when we know we are checking a dtype instance Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14641 --- python/cudf/cudf/_lib/column.pyx | 14 +++--- python/cudf/cudf/_lib/groupby.pyx | 74 +++++++++++++++++++----------- python/cudf/cudf/_lib/interop.pyx | 10 ++-- python/cudf/cudf/_lib/io/utils.pyx | 6 +-- python/cudf/cudf/_lib/json.pyx | 21 ++++----- python/cudf/cudf/_lib/orc.pyx | 7 ++- python/cudf/cudf/_lib/parquet.pyx | 21 +++------ python/cudf/cudf/_lib/scalar.pyx | 13 ++---- python/cudf/cudf/_lib/types.pyx | 34 +++++++------- python/cudf/cudf/_lib/utils.pyx | 24 ++++------ 10 files changed, 111 insertions(+), 113 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index acd0ba519dd..45aa1081b8d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -5,13 +5,13 @@ from typing import Literal import cupy as cp import numpy as np +import pandas as pd import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -344,10 +344,10 @@ cdef class Column: ) cdef mutable_column_view mutable_view(self) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -407,10 +407,10 @@ cdef class Column: return self._view(c_null_count) cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -482,7 +482,7 @@ cdef class Column: # categoricals because cudf supports ordered and unordered categoricals # while libcudf 
supports only unordered categoricals (see # https://github.com/rapidsai/cudf/pull/8567). - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] else: col = self @@ -648,7 +648,7 @@ cdef class Column: """ column_owner = isinstance(owner, Column) mask_owner = owner - if column_owner and is_categorical_dtype(owner.dtype): + if column_owner and isinstance(owner.dtype, cudf.CategoricalDtype): owner = owner.base_children[0] size = cv.size() diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index f332fead8d1..8848649736b 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,16 +1,17 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from functools import singledispatch from pandas.core.groupby.groupby import DataError -from cudf.api.types import ( - is_categorical_dtype, - is_decimal_dtype, - is_interval_dtype, - is_list_dtype, - is_string_dtype, - is_struct_dtype, -) +from cudf.api.types import is_string_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ( + CategoricalDtype, + DecimalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -73,6 +74,43 @@ _DECIMAL_AGGS = { ctypedef const scalar constscalar +@singledispatch +def get_valid_aggregation(dtype): + if is_string_dtype(dtype): + return _STRING_AGGS + return "ALL" + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: CategoricalDtype): + return _CATEGORICAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: StructDtype): + return _STRUCT_AGGS + + +@get_valid_aggregation.register +def _(dtype: IntervalDtype): + return _INTERVAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: DecimalDtype): + return _DECIMAL_AGGS + + cdef _agg_result_from_columns( vector[libcudf_groupby.aggregation_result]& c_result_columns, set column_included, @@ -187,15 +225,7 @@ cdef class GroupBy: for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.aggregation_request()) @@ -258,15 +288,7 @@ cdef class GroupBy: for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.scan_request()) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 8fd2a409d90..13c8ce43ea3 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr @@ -18,8 +18,8 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ListDtype, StructDtype def from_dlpack(dlpack_capsule): @@ -98,7 +98,7 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *: if cols_dtypes is not None: for idx, (col_name, col_dtype) in enumerate(cols_dtypes): cpp_metadata.push_back(column_metadata(col_name.encode())) - if is_struct_dtype(col_dtype) or is_list_dtype(col_dtype): + if isinstance(col_dtype, (ListDtype, StructDtype)): _set_col_children_metadata(col_dtype, cpp_metadata[idx]) else: raise TypeError( @@ -113,14 +113,14 @@ cdef _set_col_children_metadata(dtype, cdef column_metadata element_metadata - if is_struct_dtype(dtype): + if isinstance(dtype, StructDtype): for name, value in dtype.fields.items(): element_metadata = column_metadata(name.encode()) _set_col_children_metadata( value, element_metadata ) col_meta.children_meta.push_back(element_metadata) - elif is_list_dtype(dtype): + elif isinstance(dtype, ListDtype): col_meta.children_meta.reserve(2) # Offsets - child 0 col_meta.children_meta.push_back(column_metadata()) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 9b027a4d275..ae978d18813 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory @@ -23,7 +23,7 @@ import errno import io import os -from cudf.api.types import is_struct_dtype +from cudf.core.dtypes import StructDtype # Converts the Python source input to libcudf IO source_info @@ -172,7 +172,7 @@ cdef Column update_column_struct_field_names( ) col.set_base_children(tuple(children)) - if is_struct_dtype(col): + if isinstance(col.dtype, StructDtype): field_names.reserve(len(col.base_children)) for i in range(info.children.size()): field_names.push_back(info.children[i].name) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 437c3ef6ec4..c361a3f00c4 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
# cython: boundscheck = False @@ -17,6 +17,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +from cudf._lib.column cimport Column from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.json cimport ( json_reader_options, @@ -42,10 +43,6 @@ from cudf._lib.io.utils cimport ( from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table -from cudf.api.types import is_list_dtype, is_struct_dtype - -from cudf._lib.column cimport Column - cpdef read_json(object filepaths_or_buffers, object dtype, @@ -214,13 +211,12 @@ def write_json( cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef schema_element s_element cdef data_type lib_type - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) lib_type = dtype_to_data_type(dtype) s_element.type = lib_type if isinstance(dtype, cudf.StructDtype): @@ -237,19 +233,18 @@ cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) return dtype_to_data_type(dtype) cdef _set_col_children_metadata(Column col, column_name_info& col_meta): cdef column_name_info child_info - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -258,7 +253,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, col_meta.children[i] ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): for i, child_col in enumerate(col.children): col_meta.children.push_back(child_info) _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 0ae039b14d2..c64296eb7da 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import cudf from cudf.core.buffer import acquire_spill_lock @@ -59,7 +59,6 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table from pyarrow.lib import NativeFile from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.api.types import is_list_dtype, is_struct_dtype cpdef read_raw_orc_statistics(filepath_or_buffer): @@ -474,7 +473,7 @@ cdef class ORCWriter: cdef _set_col_children_metadata(Column col, column_in_metadata& col_meta, list_column_as_map=False): - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -482,7 +481,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, col_meta.child(i), list_column_as_map ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): if list_column_as_map: col_meta.set_list_column_as_map() _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 4acb1ce10b1..27efc5e1ecd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cython: boundscheck = False @@ -18,12 +18,7 @@ import numpy as np from cython.operator cimport dereference -from cudf.api.types import ( - is_decimal_dtype, - is_list_dtype, - is_list_like, - is_struct_dtype, -) +from cudf.api.types import is_list_like from cudf._lib.utils cimport data_from_unique_ptr @@ -220,7 +215,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # update the decimal precision of each column for col in names: - if is_decimal_dtype(df._data[col].dtype): + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): df._data[col].dtype.precision = ( meta_data_per_column[col]["metadata"]["precision"] ) @@ -703,7 +698,7 @@ cdef _set_col_metadata( # is true. col_meta.set_nullability(True) - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -713,13 +708,11 @@ cdef _set_col_metadata( col_meta.child(i), force_nullable_schema ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): _set_col_metadata( col.children[1], col_meta.child(1), force_nullable_schema ) - else: - if is_decimal_dtype(col): - col_meta.set_decimal_precision(col.dtype.precision) - return + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 27fb9e994f0..37708a4e3ba 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import copy @@ -14,12 +14,7 @@ from libcpp.utility cimport move import cudf from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ( - ListDtype, - StructDtype, - is_list_dtype, - is_struct_dtype, -) +from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT cimport cudf._lib.cpp.types as libcudf_types @@ -79,9 +74,9 @@ def gather_metadata(dtypes): out = [] for name, dtype in dtypes.items(): v = pylibcudf.interop.ColumnMetadata(name) - if is_struct_dtype(dtype): + if isinstance(dtype, cudf.StructDtype): v.children_meta = gather_metadata(dtype.fields) - elif is_list_dtype(dtype): + elif isinstance(dtype, cudf.ListDtype): # Offsets column is unnamed and has no children v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) v.children_meta.extend( diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d87104bf168..1b4f4617e97 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from enum import IntEnum @@ -238,15 +238,15 @@ cdef dtype_from_column_view(column_view cv): cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: cdef libcudf_types.type_id tid - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): tid = libcudf_types.type_id.LIST - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): tid = libcudf_types.type_id.STRUCT - elif cudf.api.types.is_decimal128_dtype(dtype): + elif isinstance(dtype, cudf.Decimal128Dtype): tid = libcudf_types.type_id.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): + elif isinstance(dtype, cudf.Decimal64Dtype): tid = libcudf_types.type_id.DECIMAL64 - elif cudf.api.types.is_decimal32_dtype(dtype): + elif isinstance(dtype, cudf.Decimal32Dtype): tid = libcudf_types.type_id.DECIMAL32 else: tid = ( @@ -259,21 +259,21 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: return libcudf_types.data_type(tid) cpdef dtype_to_pylibcudf_type(dtype): - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): return pylibcudf.DataType(pylibcudf.TypeId.LIST) - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): return pylibcudf.DataType(pylibcudf.TypeId.STRUCT) - elif cudf.api.types.is_decimal_dtype(dtype): - if cudf.api.types.is_decimal128_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL64 - else: - tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = pylibcudf.TypeId.DECIMAL128 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 return pylibcudf.DataType(tid, -dtype.scale) - # libcudf types don't support localization so convert to the base type - if isinstance(dtype, pd.DatetimeTZDtype): + elif isinstance(dtype, pd.DatetimeTZDtype): dtype = np.dtype(f" Date: Fri, 19 Jan 2024 09:32:43 -0500 Subject: [PATCH 13/60] Fix calls to deprecated strings factory API (#14771) Fixes deprecation warnings introduced when #14202 merged. 
Most of these are for calls to `cudf::make_strings_column` which deprecated the chars-column function overload. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14771 --- cpp/benchmarks/common/generate_input.cu | 4 +-- cpp/benchmarks/json/json.cu | 4 +-- .../cudf/strings/detail/copy_if_else.cuh | 4 +-- .../cudf/strings/detail/copy_range.cuh | 4 +-- cpp/include/cudf/strings/detail/gather.cuh | 2 +- cpp/include/cudf/strings/detail/merge.cuh | 14 +++----- .../detail/strings_column_factories.cuh | 9 +++-- cpp/include/cudf_test/column_wrapper.hpp | 32 +++++++++-------- cpp/src/hash/md5_hash.cu | 10 +++--- cpp/src/interop/from_arrow.cu | 4 +-- cpp/src/io/csv/durations.cu | 4 +-- cpp/src/io/csv/writer_impl.cu | 7 ++-- cpp/src/io/json/legacy/reader_impl.cu | 34 +++++++++---------- cpp/src/io/json/write_json.cu | 24 +++++-------- cpp/src/io/parquet/predicate_pushdown.cpp | 4 +-- cpp/src/io/text/multibyte_split.cu | 2 +- cpp/src/io/utilities/column_buffer.cpp | 22 ++---------- cpp/src/io/utilities/data_casting.cu | 7 ++-- cpp/src/json/json_path.cu | 7 ++-- cpp/src/lists/interleave_columns.cu | 2 +- cpp/src/replace/clamp.cu | 4 +-- cpp/src/replace/nulls.cu | 9 ++--- cpp/src/replace/replace.cu | 18 ++++------ cpp/src/reshape/interleave_columns.cu | 9 +++-- cpp/src/strings/capitalize.cu | 9 ++--- cpp/src/strings/case.cu | 8 ++--- cpp/src/strings/char_types/char_types.cu | 9 ++--- cpp/src/strings/combine/concatenate.cu | 14 ++++---- cpp/src/strings/combine/join.cu | 7 ++-- cpp/src/strings/combine/join_list_elements.cu | 16 ++++++--- cpp/src/strings/convert/convert_booleans.cu | 4 +-- cpp/src/strings/convert/convert_datetime.cu | 4 +-- cpp/src/strings/convert/convert_durations.cu | 4 +-- .../strings/convert/convert_fixed_point.cu | 4 +-- cpp/src/strings/convert/convert_floats.cu | 4 +-- cpp/src/strings/convert/convert_hex.cu | 8 ++--- cpp/src/strings/convert/convert_integers.cu | 4 +-- cpp/src/strings/convert/convert_ipv4.cu | 10 +++--- cpp/src/strings/convert/convert_lists.cu | 11 +++--- cpp/src/strings/convert/convert_urls.cu | 12 +++---- cpp/src/strings/copying/concatenate.cu | 7 ++-- cpp/src/strings/copying/shift.cu | 8 ++--- cpp/src/strings/filling/fill.cu | 2 +- cpp/src/strings/filter_chars.cu | 9 ++--- cpp/src/strings/padding.cu | 15 ++++---- cpp/src/strings/repeat_strings.cu | 10 +++--- cpp/src/strings/replace/backref_re.cu | 10 +++--- cpp/src/strings/replace/multi.cu | 16 ++++----- cpp/src/strings/replace/multi_re.cu | 8 ++--- cpp/src/strings/replace/replace.cu | 25 +++++++------- cpp/src/strings/replace/replace_re.cu | 8 ++--- cpp/src/strings/slice.cu | 4 +-- cpp/src/strings/translate.cu | 8 ++--- cpp/src/text/bpe/byte_pair_encoding.cu | 6 ++-- cpp/src/text/detokenize.cu | 11 +++--- cpp/src/text/generate_ngrams.cu | 16 ++++++--- cpp/src/text/ngrams_tokenize.cu | 10 +++--- cpp/src/text/normalize.cu | 12 +++---- cpp/src/text/replace.cu | 16 +++++---- cpp/src/text/tokenize.cu | 9 +++-- cpp/src/transform/row_conversion.cu | 3 +- cpp/tests/copying/concatenate_tests.cpp | 34 +++++++++---------- cpp/tests/strings/contains_tests.cpp | 11 +++--- cpp/tests/strings/factories_test.cu | 14 +++----- cpp/tests/transform/row_conversion.cpp | 30 ++++------------ 65 files changed, 309 insertions(+), 361 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index bb7529bb37a..0ea13957868 100644 --- 
a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -540,7 +540,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons return cudf::make_strings_column( num_rows, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), null_count, profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); } diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu index c74701445f8..020c8e413b3 100644 --- a/cpp/benchmarks/json/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -177,10 +177,10 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets, chars] = cudf::strings::detail::make_strings_children( jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return cudf::make_strings_column( - num_rows, std::move(children.first), std::move(children.second), 0, {}); + num_rows, std::move(offsets), std::move(chars->release().data.release()[0]), 0, {}); } void BM_case(benchmark::State& state, std::string query_arg) diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 6f0b199ff12..64e14dcc549 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,7 +109,7 @@ std::unique_ptr copy_if_else(StringIterLeft lhs_begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 5da3addd9a4..567452bac4e 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -205,7 +205,7 @@ std::unique_ptr copy_range(SourceValueIterator source_value_begin, return make_strings_column(target.size(), std::move(p_offsets_column), - std::move(p_chars_column), + std::move(p_chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index f7d2ebebe9a..442155380a2 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -321,7 +321,7 @@ std::unique_ptr gather(strings_column_view const& strings, return make_strings_column(output_count, std::move(out_offsets_column), - std::move(out_chars_column), + std::move(out_chars_column->release().data.release()[0]), 0, // caller sets these rmm::device_buffer{}); } diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index aef1fe93792..8049895c3c2 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,9 +89,8 @@ std::unique_ptr merge(strings_column_view const& lhs, auto d_offsets = offsets_column->view().template data(); // create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // merge the strings - auto d_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -103,11 +102,8 @@ std::unique_ptr merge(strings_column_view const& lhs, memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index de7db4ce47b..fcbdfa619f4 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -137,7 +137,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -187,13 +187,12 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, [] __device__(auto offset) { return static_cast(offset); })); // build chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_view.data()); + rmm::device_uvector chars_data(bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_data.begin()); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars_data.release(), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf_test/column_wrapper.hpp 
b/cpp/include/cudf_test/column_wrapper.hpp index abcd89c3035..c4fa4be0f89 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -757,20 +757,21 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_async( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); wrapped = - cudf::make_strings_column(num_strings, std::move(d_offsets), std::move(d_chars), 0, {}); + cudf::make_strings_column(num_strings, std::move(d_offsets), d_chars.release(), 0, {}); } /** @@ -805,23 +806,24 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v) : column_wrapper{} { - size_type num_strings = std::distance(begin, end); + size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_async( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( - cudf::detail::make_device_uvector_sync( + cudf::detail::make_device_uvector_async( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); wrapped = cudf::make_strings_column( - num_strings, std::move(d_offsets), std::move(d_chars), null_count, d_bitmask.release()); + num_strings, std::move(d_offsets), d_chars.release(), null_count, d_bitmask.release()); } /** diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 8fc3e63bc59..002c9a9137b 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
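In the test wrapper above, the host-side chars vector now reaches the device as a plain uvector (copied asynchronously) whose buffer goes straight into the factory, instead of first being wrapped in an INT8 column, and an empty input short-circuits to make_empty_column so no zero-sized buffers are built. Roughly, the first constructor reads as below once the hunks apply; this is a consolidation sketch, and begin, end, all_valid and the detail helpers are the wrapper's own names, assumed from the surrounding file.

    // Sketch only; headers, stream and memory-resource setup as in the wrapper above.
    cudf::size_type const num_strings = std::distance(begin, end);
    if (num_strings == 0) {
      wrapped = cudf::make_empty_column(cudf::type_id::STRING);
      return;
    }
    auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid);  // host vectors
    // chars: asynchronous host-to-device copy, then hand the raw buffer to the factory.
    auto d_chars = cudf::detail::make_device_uvector_async(
      chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
    // offsets: still wrapped in a column, as the factory expects.
    auto d_offsets = std::make_unique<cudf::column>(
      cudf::detail::make_device_uvector_sync(
        offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
      rmm::device_buffer{},
      0);
    wrapped = cudf::make_strings_column(num_strings, std::move(d_offsets), d_chars.release(), 0, {});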
@@ -333,9 +333,8 @@ std::unique_ptr md5(table_view const& input, auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const device_input = table_device_view::create(input, stream); @@ -366,8 +365,7 @@ std::unique_ptr md5(table_view const& input, } }); - return make_strings_column( - input.num_rows(), std::move(offsets_column), std::move(chars_column), 0, {}); + return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); } } // namespace detail diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index e39625c92e7..7b44fb41288 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -290,7 +290,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), array.null_count(), std::move(*get_mask_buffer(array, stream, mr))); diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 66143d3fdee..f4d32edac89 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -202,7 +202,7 @@ struct dispatch_from_durations_fn { // return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 995d8d942c9..65473073e31 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -181,11 +181,12 @@ struct column_to_strings_fn { auto d_column = column_device_view::create(column_v, stream_); escape_strings_fn fn{*d_column, delimiter.value(stream_)}; - auto children = cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream_, mr_)); } diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu index 5580628b0fe..d461f27c921 100644 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -530,29 +530,27 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - auto target = make_strings_column( - static_cast(target_offsets.size() - 1), - std::make_unique( - cudf::detail::make_device_uvector_async( - target_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - std::make_unique(cudf::detail::make_device_uvector_async( - target_chars, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - 0, - {}); + auto target = + make_strings_column(static_cast(target_offsets.size() - 1), + std::make_unique( + cudf::detail::make_device_uvector_async( + target_offsets, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + cudf::detail::make_device_uvector_async( + target_chars, stream, rmm::mr::get_current_device_resource()) + .release(), + 0, + {}); auto repl = make_strings_column( static_cast(repl_offsets.size() - 1), std::make_unique(cudf::detail::make_device_uvector_async( repl_offsets, stream, rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0), - std::make_unique(cudf::detail::make_device_uvector_async( - repl_chars, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), + cudf::detail::make_device_uvector_async( + repl_chars, stream, rmm::mr::get_current_device_resource()) + .release(), 0, {}); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index c35f15049bd..84e0ac9e74d 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,12 +170,12 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream, mr)); } @@ -347,13 +347,11 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, d_strview_offsets + row_string_offsets.size(), old_offsets.begin(), row_string_offsets.begin()); - auto chars_data = joined_col->release().data; - auto const chars_size = chars_data->size(); + auto chars_data = joined_col->release().data; return make_strings_column( strings_columns.num_rows(), std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), + std::move(chars_data.release()[0]), 0, {}); } @@ -472,13 +470,11 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri d_strview_offsets.end(), old_offsets.begin(), row_string_offsets.begin()); - auto chars_data = joined_col->release().data; - auto const chars_size = chars_data->size(); + auto chars_data = joined_col->release().data; return make_strings_column( num_lists, std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), + std::move(chars_data.release()[0]), lists_strings.null_count(), 
cudf::detail::copy_bitmask(lists_strings.parent(), stream, mr)); } @@ -780,11 +776,7 @@ std::unique_ptr make_strings_column_from_host(host_span(std::move(d_chars), rmm::device_buffer{}, 0), - 0, - {}); + host_strings.size(), std::move(d_offsets), d_chars.release(), 0, {}); } std::unique_ptr make_column_names_column(host_span column_names, diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9c8b03886b5..f43a8fd24c4 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -189,7 +189,7 @@ struct stats_caster { return cudf::make_strings_column( val.size(), std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(d_chars), rmm::device_buffer{}, 0), + d_chars.release(), null_count, rmm::device_buffer{ null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 2194ee1aaa1..34a476974e4 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -556,7 +556,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return cudf::make_strings_column( string_count, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), 0, {}); } diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 88617510394..36303a60aa9 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -68,26 +68,10 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns - auto const state = mask_state::UNALLOCATED; - auto str_col = - _string_data.is_empty() - ? 
make_empty_column(data_type{type_id::INT8}) - : std::make_unique(data_type{type_id::INT8}, - string_size(), - std::move(_string_data), - cudf::detail::create_null_mask(size, state, stream, _mr), - state_null_count(state, size), - std::vector>{}); - auto offsets_col = - std::make_unique(data_type{type_to_id()}, - size + 1, - std::move(_data), - cudf::detail::create_null_mask(size + 1, state, stream, _mr), - state_null_count(state, size + 1), - std::vector>{}); - + auto offsets_col = std::make_unique( + data_type{type_to_id()}, size + 1, std::move(_data), rmm::device_buffer{}, 0); return make_strings_column( - size, std::move(offsets_col), std::move(str_col), null_count(), std::move(_null_mask)); + size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask)); } namespace { diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 9545811a542..8fd860d9492 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -861,9 +861,8 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, std::overflow_error); // CHARS column - std::unique_ptr chars = - strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); single_thread_fn.d_chars = d_chars; thrust::for_each_n(rmm::exec_policy(stream), @@ -902,7 +901,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, return make_strings_column(col_size, std::move(offsets), - std::move(chars), + chars.release(), d_null_count.value(stream), std::move(null_mask)); } diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 6794838c70f..146b54c0d87 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -1010,7 +1010,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column - auto chars = cudf::strings::detail::create_chars_child_column(output_size, stream, mr); + rmm::device_uvector chars(output_size, stream, mr); // potential optimization : if we know that all outputs are valid, we could skip creating // the validity mask altogether @@ -1018,7 +1018,6 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - cudf::mutable_column_view chars_view(*chars); rmm::device_scalar d_valid_count{0, stream}; get_json_object_kernel @@ -1026,14 +1025,14 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c *cdv, std::get<0>(preprocess).value().data(), offsets_view.head(), - chars_view.head(), + chars.data(), static_cast(validity.data()), d_valid_count.data(), options); auto result = make_strings_column(col.size(), std::move(offsets), - std::move(chars), + chars.release(), col.size() - d_valid_count.value(stream), std::move(validity)); // unmatched array query may result in unsanitized '[' value in the result diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 7b37e2dc8f6..8f05b020a2e 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -237,7 +237,7 @@ struct interleave_list_entries_implrelease().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 6852b19af44..23c792ddcae 
100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,7 +106,7 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr))); } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index bd3e75e2e80..8ea229368cc 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -255,22 +255,19 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< auto offsets_view = offsets->mutable_view(); // Allocate chars array and output null mask - std::unique_ptr output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); + rmm::device_uvector output_chars(bytes, stream, mr); replace_second<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), offsets_view.begin(), - output_chars_view.data(), + output_chars.data(), valid_count); return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), input.size() - valid_counter.value(stream), std::move(valid_bits)); } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 7cad2fb10d3..184c30246c7 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -187,7 +187,7 @@ template CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, cudf::column_device_view replacement, cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view strings, + char* strings, cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); @@ -211,9 +211,8 @@ CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, cudf::string_view output = (replace_idx == -1) ? input.element(idx) : replacement.element(replace_idx); - std::memcpy(strings.data() + offsets.data()[idx], - output.data(), - output.size_bytes()); + std::memcpy( + strings + offsets.data()[idx], output.data(), output.size_bytes()); } tid += stride; @@ -434,18 +433,15 @@ std::unique_ptr replace_kernel_forwarder::operator() output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); - auto device_chars = cudf::mutable_column_device_view::create(output_chars_view, stream); + rmm::device_uvector output_chars(bytes, stream, mr); + auto d_chars = output_chars.data(); replace_second<<>>( - *device_in, *device_replacement, *device_offsets, *device_chars, *device_indices); + *device_in, *device_replacement, *device_offsets, d_chars, *device_indices); return cudf::make_strings_column(input_col.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), null_count, std::move(valid_bits)); } diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index deb0acb4742..22b45fe7a58 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -193,9 +193,8 @@ struct interleave_columns_implview().template data(); // Create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // Fill the chars column - auto d_results_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_results_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -215,7 +214,7 @@ struct interleave_columns_impl capitalizer(CapitalFn cfn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index c2e8033b42d..b3bf0e2a787 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -217,7 +217,7 @@ std::unique_ptr convert_case(strings_column_view const& input, cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -264,15 +264,15 @@ std::unique_ptr convert_case(strings_column_view const& input, "Size of output exceeds the column size limit", std::overflow_error); - auto chars = create_chars_child_column(static_cast(bytes), stream, mr); + rmm::device_uvector chars(bytes, stream, mr); // second pass, write output converter.d_offsets = d_offsets; - converter.d_chars = chars->mutable_view().data(); + converter.d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 35b0c0a2690..9c2a2701227 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
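The convert_case hunks above show the shape most of the remaining kernel-side changes take: a first pass computes per-row sizes and the offsets column, the chars allocation becomes a bare rmm::device_uvector<char> instead of a child column from create_chars_child_column, the second pass writes through a raw char*, and release() forwards the allocation to the factory. A compact sketch of that two-pass shape; the copy lambda, d_input, d_offsets and the surrounding variables are placeholders rather than patch code.

    // Sketch only: pass 1 (not shown) produced `offsets_column`, its device pointer
    // `d_offsets`, and the total output size `bytes`.
    rmm::device_uvector<char> chars(bytes, stream, mr);  // was: create_chars_child_column(bytes, ...)
    char* d_chars = chars.data();

    // Pass 2: every row copies its output to its offset in the flat character buffer.
    thrust::for_each_n(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<cudf::size_type>(0),
                       num_rows,
                       [d_chars, d_offsets, d_input] __device__(cudf::size_type row) {
                         if (d_input.is_null(row)) { return; }  // nothing to write for null rows
                         auto const str = d_input.element<cudf::string_view>(row);
                         memcpy(d_chars + d_offsets[row], str.data(), str.size_bytes());
                       });

    return cudf::make_strings_column(
      num_rows, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));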
@@ -200,12 +200,13 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 0a11b6dc460..a48e84eac0c 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -142,7 +142,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; - auto children = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -157,8 +157,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -237,7 +237,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, multi_separator_concat_fn mscf{ *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; - auto children = make_strings_children(mscf, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -253,8 +253,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 48304759f7a..0e0d6e437a7 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -173,8 +173,11 @@ std::unique_ptr join_strings(strings_column_view const& input, : rmm::device_buffer{0, stream, mr}; // perhaps this return a string_scalar instead of a single-row column - return make_strings_column( - 1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(1, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/combine/join_list_elements.cu 
b/cpp/src/strings/combine/join_list_elements.cu index 372b49fb0ee..619f5feba15 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -215,8 +215,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } namespace { @@ -290,8 +293,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index e75f1a6fe0f..4fe0be7883f 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -147,7 +147,7 @@ std::unique_ptr from_booleans(column_view const& booleans, return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), booleans.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d2609441d72..b7a662b0b76 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1158,7 +1158,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, return make_strings_column(timestamps.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), timestamps.null_count(), cudf::detail::copy_bitmask(timestamps, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 987087042cb..9a58926539c 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -422,7 +422,7 @@ struct dispatch_from_durations_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 2c59f6dcd29..975f03b37d6 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -249,7 +249,7 @@ struct dispatch_from_fixed_point_fn { return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 9b3ef8f452b..c56e723de8e 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -408,7 +408,7 @@ struct dispatch_from_floats_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), floats.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 8f656b149a5..68cff214507 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,12 +180,12 @@ struct dispatch_integers_to_hex_fn { { auto const d_column = column_device_view::create(input, stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integer_to_hex_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 56637e88e19..364cb534d2f 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -367,7 +367,7 @@ struct dispatch_from_integers_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), integers.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 75527e24e79..e07be26a23c 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -165,13 +165,13 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); - auto d_column = column_device_view::create(integers, stream); - auto children = cudf::strings::detail::make_strings_children( + auto d_column = column_device_view::create(integers, stream); + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr); return make_strings_column(integers.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), integers.null_count(), cudf::detail::copy_bitmask(integers, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index f9f2b91eb12..1f22aea284b 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -216,14 +216,17 @@ std::unique_ptr format_list_column(lists_column_view const& input, auto const d_separators = column_device_view::create(separators.parent(), stream); auto const d_na_rep = na_rep.value(stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, input.size(), stream, mr); - return make_strings_column( - input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return make_strings_column(input.size(), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index b16eb318b39..a9ddcfa12a2 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -135,12 +135,12 @@ std::unique_ptr url_encode(strings_column_view const& input, auto d_column = column_device_view::create(input.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( url_encoder_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -409,8 +409,8 @@ std::unique_ptr url_decode(strings_column_view const& strings, auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); // create the chars column - auto chars_column = create_chars_child_column(out_chars_bytes, stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(out_chars_bytes, stream, mr); + auto d_out_chars = chars.data(); // decode and copy the characters from the input column to the output column url_decode_char_replacer @@ -422,7 +422,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 8cabd0dc75f..c4564b1105b 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -228,9 +228,8 @@ std::unique_ptr concatenate(host_span columns, std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); // create output chars column - auto chars_column = create_chars_child_column(total_bytes, stream, mr); - auto d_new_chars = chars_column->mutable_view().data(); - chars_column->set_null_count(0); + rmm::device_uvector output_chars(total_bytes, stream, mr); + auto d_new_chars = output_chars.data(); // create output offsets column auto offsets_column = make_numeric_column( @@ -304,7 +303,7 @@ std::unique_ptr concatenate(host_span columns, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + output_chars.release(), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 3b798a87d54..331cdecc36f 100644 --- a/cpp/src/strings/copying/shift.cu +++ 
b/cpp/src/strings/copying/shift.cu @@ -114,19 +114,19 @@ std::unique_ptr shift(strings_column_view const& input, }(); // create output chars child column - auto chars_column = create_chars_child_column(static_cast(total_bytes), stream, mr); - auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); + rmm::device_uvector chars(total_bytes, stream, mr); + auto d_chars = chars.data(); // run kernel to shift all the characters thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(total_bytes), - d_chars->data(), + d_chars, shift_chars_fn{*d_input, d_fill_str, shift_offset}); // caller sets the null-mask return make_strings_column( - input.size(), std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + input.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace cudf::strings::detail diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 49e1b11c1db..d2e3b6f6af3 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -98,7 +98,7 @@ std::unique_ptr fill(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 9f95fedfe0b..7a26fc45dcb 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,11 +139,12 @@ std::unique_ptr filter_characters( // this utility calls the strip_fn to build the offsets and chars columns filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement}; - auto children = cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 850ccaa4535..ec77aea6338 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -112,7 +112,7 @@ std::unique_ptr pad(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); - auto children = [&] { + auto [offsets_column, chars_column] = [&] { if (side == side_type::LEFT) { auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; return make_strings_children(fn, input.size(), stream, mr); @@ -125,8 +125,8 @@ std::unique_ptr pad(strings_column_view const& input, }(); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -151,11 +151,12 @@ std::unique_ptr zfill(strings_column_view const& input, if (input.is_empty()) return make_empty_column(type_id::STRING); auto d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); + auto [offsets_column, chars_column] = + make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 847a64f5602..b4a770f72bd 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,8 +81,6 @@ auto generate_empty_output(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto chars_column = create_chars_child_column(0, stream, mr); - auto offsets_column = make_numeric_column( data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), @@ -92,7 +90,7 @@ auto generate_empty_output(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + rmm::device_buffer{}, input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -166,7 +164,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -252,7 +250,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index fc11b7d80b3..edec525a913 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -126,8 +126,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - using BackRefIterator = decltype(backrefs.begin()); - auto children = make_strings_children( + using BackRefIterator = decltype(backrefs.begin()); + auto [offsets_column, chars_column] = make_strings_children( backrefs_fn{*d_strings, d_repl_template, backrefs.begin(), backrefs.end()}, *d_prog, input.size(), @@ -135,8 +135,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index a0f9d1136f3..3d0210d61b0 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -374,12 +374,8 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in }); // use this utility to gather the string parts into a contiguous chars column - auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); - // TODO ideally we can pass this chars_data as rmm buffer to make_strings_column - auto chars_data = chars->release().data; - auto const chars_size = chars_data->size(); - auto chars_col = std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0); + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars_data = chars->release().data; // create offsets from the sizes offsets = @@ -388,7 +384,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // build the strings columns from the chars and offsets return make_strings_column(strings_count, std::move(offsets), - std::move(chars_col), + std::move(chars_data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -458,12 +454,12 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input auto d_targets = column_device_view::create(targets.parent(), stream); auto d_replacements = column_device_view::create(repls.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 3375cb7a789..c212d9f44ba 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
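The expression std::move(chars_column->release().data.release()[0]) that recurs throughout these hunks is the chars hand-off compressed into one line: make_strings_children still returns an offsets column and a chars column, so the character buffer has to be pulled out of the latter before the buffer-accepting factory can take it. Unpacked, the same step reads roughly as below; `fn` stands in for whatever size-and-copy functor the caller uses, and this is a readability sketch, not code from the patch.

    // Sketch only: what the one-line hand-off does, step by step.
    auto [offsets_column, chars_column] =
      cudf::strings::detail::make_strings_children(fn, input.size(), stream, mr);

    cudf::column::contents contents = chars_column->release();    // data, null_mask, children
    // contents.data is a std::unique_ptr<rmm::device_buffer> holding the character bytes.
    rmm::device_buffer chars_buffer = std::move(*contents.data);  // move the device allocation out

    auto result = cudf::make_strings_column(input.size(),
                                            std::move(offsets_column),
                                            std::move(chars_buffer),
                                            input.null_count(),
                                            cudf::detail::copy_bitmask(input.parent(), stream, mr));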
@@ -185,15 +185,15 @@ std::unique_ptr replace_re(strings_column_view const& input, auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 8c4bd4490b9..936127f254b 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -486,9 +486,8 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings offsets_update_fn); // build the characters column - auto chars_column = - create_chars_child_column(chars_bytes + (delta_per_target * target_count), stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(chars_bytes + (delta_per_target * target_count), stream, mr); + auto d_out_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(chars_start), @@ -501,7 +500,7 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -532,12 +531,12 @@ std::unique_ptr replace_row_parallel(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -697,12 +696,12 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -733,8 +732,8 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, auto d_offsets = offsets_column->view().data(); // build chars column - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); 
thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -745,7 +744,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, }); return make_strings_column( - strings_count, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + strings_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 502d5f1a52e..10d83932928 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,12 +116,12 @@ std::unique_ptr replace_re(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_regex_fn{*d_strings, d_repl, maxrepl}, *d_prog, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 5a1fee92c7d..1e55986fdb8 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -209,7 +209,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, return make_strings_column(strings.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 0ca5e103d3d..039a8ac8a62 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -110,12 +110,12 @@ std::unique_ptr translate(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 1f125636208..c6d299424d2 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -429,8 +429,8 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const std::overflow_error); // build the output: inserting separators to the input character data - auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const d_inserts = d_working.data(); // stores the insert positions auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { @@ -453,7 +453,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 38cb7dd6753..60625d6383a 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -156,15 +156,18 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string cudf::string_view const d_separator(separator.data(), separator.size()); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator}, output_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - output_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(output_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 1d3e98a25ad..882d9a04501 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -139,12 +139,15 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s // compute the number of strings of ngrams auto const ngrams_count = strings_count - ngrams + 1; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - ngrams_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(ngrams_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail @@ -239,8 +242,11 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( generator, strings_count, total_ngrams, stream, mr); - return cudf::make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + return cudf::make_strings_column(total_ngrams, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } namespace { diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index bc5cd04eac6..642dca5fc47 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -232,9 +232,8 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s rmm::device_uvector ngram_sizes(total_ngrams, stream); // build output chars column - auto chars_column = cudf::strings::detail::create_chars_child_column( - static_cast(output_chars_size), stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(output_chars_size, stream, mr); + auto d_chars = chars.data(); // Generate the ngrams into the chars column data buffer. // The ngram_builder_fn functor also fills the ngram_sizes vector with the // size of each ngram. 
@@ -253,11 +252,10 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s // build the offsets column -- converting the ngram sizes into offsets auto offsets_column = std::get<0>( cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr)); - chars_column->set_null_count(0); offsets_column->set_null_count(0); // create the output strings column return make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 5a0977d410f..d46ca25835f 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -182,12 +182,12 @@ std::unique_ptr normalize_spaces(cudf::strings_column_view const& auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the normalize_space_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( normalize_spaces_fn{*d_strings}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -228,12 +228,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index a4b28fe2dab..50d7bbd077d 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -228,12 +228,13 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } @@ -260,12 +261,13 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index c43b9dda56c..c256607fb23 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -219,14 +219,13 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const return idx < chars_bytes ? cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true; }); - // create the output chars column -- just a copy of the input's chars column - cudf::column_view chars_view( - cudf::data_type{cudf::type_id::INT8}, chars_bytes, d_chars, nullptr, 0); - auto chars_column = std::make_unique(chars_view, stream, mr); + // create the output chars buffer -- just a copy of the input's chars + rmm::device_uvector output_chars(chars_bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, output_chars.data()); // return new strings column return cudf::make_strings_column( - num_characters, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + num_characters, std::move(offsets_column), output_chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index ef12fbeae52..b294369a90e 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -2509,8 +2509,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const& input, make_strings_column(num_rows, std::make_unique( std::move(string_col_offsets[string_idx]), rmm::device_buffer{}, 0), - std::make_unique( - std::move(string_data_cols[string_idx]), rmm::device_buffer{}, 0), + string_data_cols[string_idx].release(), 0, std::move(*string_data.null_mask.release())); // Null count set to 0, temporarily. Will be fixed up before return. diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 06fb687ac2d..0f7c1053adf 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -406,9 +407,9 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns of with 1 billion chars in each auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; - auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); + auto many_chars = rmm::device_uvector(size, cudf::get_default_stream()); auto col = cudf::make_strings_column( - 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + 1, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); cudf::table_view tbl({*col}); EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), @@ -422,7 +423,7 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns 1 billion rows each auto many_offsets = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size + 1); - auto chars = cudf::test::fixed_width_column_wrapper{0, 1, 2}; + auto chars = rmm::device_uvector(3, cudf::get_default_stream()); auto col = cudf::make_strings_column( size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{}); @@ -533,10 +534,9 @@ TEST_F(OverflowTest, Presliced) auto offset_gen = cudf::detail::make_counting_transform_iterator( 0, [string_size](cudf::size_type index) { return index * string_size; }); cudf::test::fixed_width_column_wrapper offsets(offset_gen, offset_gen + num_rows + 1); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::split(*col, {(num_rows / 2) - 1}); @@ -557,13 +557,12 @@ TEST_F(OverflowTest, Presliced) constexpr cudf::size_type num_rows = total_chars_size / string_size; // try and concatenate 4 string columns of with ~1/2 billion chars in each - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) // leaving this disabled as it typically runs out of memory on a T4 @@ -636,7 +635,7 @@ TEST_F(OverflowTest, Presliced) 
cudf::numeric_scalar(0), cudf::numeric_scalar(list_size)); - auto col = cudf::make_strings_column( + auto col = cudf::make_lists_column( num_rows, std::move(offsets), std::move(struct_col), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) @@ -722,13 +721,12 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) constexpr cudf::size_type num_rows = 1024; constexpr cudf::size_type string_size = inner_size / num_rows; - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(inner_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::slice(*col, {16, 32}); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 13459197aa3..2d9e2035e5e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -297,17 +297,14 @@ TEST_F(StringsContainsTests, HexTest) std::vector offsets( {thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + count + 1}); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_sync( + ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); - auto input = cudf::make_strings_column(count, std::move(d_offsets), std::move(d_chars), 0, {}); + auto input = cudf::make_strings_column(count, std::move(d_offsets), d_chars.release(), 0, {}); auto strings_view = cudf::strings_column_view(input->view()); for (auto ch : ascii_chars) { diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 5381ad63bc3..64123690aea 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -145,11 +145,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) } std::vector h_nulls{h_null_mask}; - auto d_buffer = std::make_unique( - cudf::detail::make_device_uvector_sync( - h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_buffer = cudf::detail::make_device_uvector_sync( + h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -158,7 +155,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) auto d_nulls = cudf::detail::make_device_uvector_sync( h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto column = cudf::make_strings_column( - count, 
std::move(d_offsets), std::move(d_buffer), null_count, d_nulls.release()); + count, std::move(d_offsets), d_buffer.release(), null_count, d_nulls.release()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); EXPECT_EQ(1, column->num_children()); @@ -197,8 +194,7 @@ TEST_F(StringsFactoriesTest, CreateScalar) TEST_F(StringsFactoriesTest, EmptyStringsColumn) { - auto d_chars = std::make_unique( - rmm::device_uvector{0, cudf::get_default_stream()}, rmm::device_buffer{}, 0); + auto d_chars = rmm::device_uvector(0, cudf::get_default_stream()); auto d_offsets = std::make_unique( cudf::detail::make_zeroed_device_uvector_sync( 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -207,7 +203,7 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) rmm::device_uvector d_nulls{0, cudf::get_default_stream()}; auto results = - cudf::make_strings_column(0, std::move(d_offsets), std::move(d_chars), 0, d_nulls.release()); + cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); rmm::device_uvector> d_strings{ diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp index e54929f1651..542ccc5e2d5 100644 --- a/cpp/tests/transform/row_conversion.cpp +++ b/cpp/tests/transform/row_conversion.cpp @@ -14,26 +14,20 @@ * limitations under the License. */ +#include +#include +#include +#include + #include #include -#include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include - -#include -#include -#include #include +#include struct ColumnToRowTests : public cudf::test::BaseFixture {}; struct RowToColumnTests : public cudf::test::BaseFixture {}; @@ -833,19 +827,7 @@ TEST_F(RowToColumnTests, SimpleString) EXPECT_EQ(new_rows.size(), 1); for (auto& row : new_rows) { auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*row), schema); - EXPECT_EQ(row->size(), 5); - auto const num_columns = new_cols->num_columns(); - - cudf::strings_column_view str_col = new_cols->get_column(1).view(); - std::vector> col_data; - std::vector> offset_data; - for (int i = 0; i < num_columns; ++i) { - offset_data.emplace_back( - std::get<0>(cudf::test::to_host(str_col.offsets()))); - col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); - } - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); } } From 8fa294584815437c664ce3616712cdc14a130f75 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Jan 2024 10:28:57 -0500 Subject: [PATCH 14/60] Disable parallel build (#14796) A new release of `pydata_sphinx_theme` [from last night](https://github.com/pydata/pydata-sphinx-theme/releases/tag/v0.15.2) includes https://github.com/pydata/pydata-sphinx-theme/pull/1642, which marks the theme as unsafe for parallel writing. 
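(For context: in the standard Sphinx-generated Makefile the `O` variable just forwards extra options to `sphinx-build`, so the change below is roughly equivalent to dropping the parallel-worker flag. Illustrative sketch only, assuming the usual Makefile layout under `docs/cudf`; not part of this patch.)

```bash
# `make dirhtml O="-j 4"` expands to roughly:
sphinx-build -M dirhtml source build -j 4   # parallel build with 4 workers
# after this patch the docs build runs serially:
sphinx-build -M dirhtml source build
```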
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14796 --- ci/build_docs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index ceab29c2473..78518cdad53 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -40,8 +40,8 @@ popd rapids-logger "Build Python docs" pushd docs/cudf -make dirhtml O="-j 4" -make text O="-j 4" +make dirhtml +make text mkdir -p "${RAPIDS_DOCS_DIR}/cudf/"{html,txt} mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" From e0905ac9cc21e90ebce0a6d31e78c1e9f21c3a2b Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 19 Jan 2024 11:12:00 -0600 Subject: [PATCH 15/60] Add `pynvjitlink` as a dependency (#14763) This PR adds `pynvjitlink` as a hard dependency for cuDF. This should allow for MVC when launching numba kernels across minor versions of CUDA 12 up to the version of `nvjitlink` statically shipped with `pynvjitlink`. cc @bdice Authors: - https://github.com/brandon-b-miller - https://github.com/jakirkham - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14763 --- ci/build_wheel.sh | 2 +- .../all_cuda-120_arch-x86_64.yaml | 1 + conda/recipes/cudf/meta.yaml | 3 +- dependencies.yaml | 5 +++- python/cudf/cudf/utils/_numba.py | 29 +++++++------------ 5 files changed, 18 insertions(+), 22 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 9c674518810..c4b794e81f7 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -48,7 +48,7 @@ fi if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} - sed -i "/ptxcompiler/d" ${pyproject_file} + sed -i "s/ptxcompiler/pynvjitlink/g" ${pyproject_file} sed -i "/cubinlinker/d" ${pyproject_file} fi diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index de59bc1d43c..4cf1d5427f4 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -70,6 +70,7 @@ dependencies: - protobuf>=4.21,<5 - pyarrow==14.0.1.* - pydata-sphinx-theme!=0.14.2 +- pynvjitlink - pytest - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bc91ee61f6f..4f39a9fe452 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -98,6 +98,7 @@ requirements: # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - cuda-python >=12.0,<13.0a0 + - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - nvtx >=0.2.1 diff --git a/dependencies.yaml b/dependencies.yaml index 9cf808907ec..90a04b2f876 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -534,16 +534,19 @@ dependencies: - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: + - matrix: {cuda: "12.*"} + packages: + - pynvjitlink - matrix: {cuda: "11.*"} packages: - cubinlinker - ptxcompiler - - {matrix: null, packages: null} - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.*"} packages: - rmm-cu12==24.2.* + - pynvjitlink-cu12 - matrix: {cuda: "11.*"} packages: - rmm-cu11==24.2.* diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 7781c14e559..6d00fd397df 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -3,23 +3,10 @@ import glob import os import sys -import warnings from functools import lru_cache from numba import config as numba_config -try: - from pynvjitlink.patch import ( - patch_numba_linker as patch_numba_linker_pynvjitlink, - ) -except ImportError: - - def patch_numba_linker_pynvjitlink(): - warnings.warn( - "CUDA Toolkit is newer than CUDA driver. " - "Numba features will not work in this configuration. " - ) - # Use an lru_cache with a single value to allow a delayed import of # strings_udf. This is the easiest way to break an otherwise circular import @@ -117,11 +104,13 @@ def _setup_numba(): version of the CUDA Toolkit used to build the PTX files shipped with the user cuDF package. """ - # ptxcompiler is a requirement for cuda 11.x packages but not - # cuda 12.x packages. However its version checking machinery - # is still necessary. If a user happens to have ptxcompiler - # in a cuda 12 environment, it's use for the purposes of - # checking the driver and runtime versions is harmless + + # Either ptxcompiler, or our vendored version (_ptxcompiler.py) + # is needed to determine the driver and runtime CUDA versions in + # the environment. In a CUDA 11.x environment, ptxcompiler is used + # to provide MVC directly, whereas for CUDA 12.x this is provided + # through pynvjitlink. The presence of either package does not + # perturb cuDF's operation in situations where they aren't used. try: from ptxcompiler.patch import NO_DRIVER, safe_get_versions except ModuleNotFoundError: @@ -145,7 +134,9 @@ def _setup_numba(): if driver_version < (12, 0): patch_numba_linker_cuda_11() else: - patch_numba_linker_pynvjitlink() + from pynvjitlink.patch import patch_numba_linker + + patch_numba_linker() def _get_cuda_version_from_ptx_file(path): From aba34fdaec3fd0be051e1ab466f7d2b64b5f7b43 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:25:48 -0500 Subject: [PATCH 16/60] Add pip install instructions to README (#13677) This PR adds pip installation instructions to the README. 
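(Illustrative only, not part of the diff below: the new section boils down to choosing the wheel that matches the local CUDA major version and then sanity-checking the install.)

```bash
# CUDA 12.x example from the new README section; use cudf-cu11 on CUDA 11.x
pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12
# quick sanity check (requires an NVIDIA GPU and a recent driver)
python -c "import cudf; print(cudf.__version__)"
```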
Authors: - Ashwin Srinath (https://github.com/shwina) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13677 --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index 996e5ff4800..0cf168123cf 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,10 @@ print(tips_df.groupby("size").tip_percentage.mean()) - [libcudf (C++/CUDA) documentation](https://docs.rapids.ai/api/libcudf/stable/) - [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +See the [RAPIDS install page](https://docs.rapids.ai/install) for +the most up-to-date information and commands for installing cuDF +and other RAPIDS packages. + ## Installation ### CUDA/GPU requirements @@ -64,6 +68,24 @@ print(tips_df.groupby("size").tip_percentage.mean()) * NVIDIA driver 450.80.02+ * Volta architecture or better (Compute Capability >=7.0) +### Pip + +cuDF can be installed via `pip` from the NVIDIA Python Package Index. +Be sure to select the appropriate cuDF package depending +on the major version of CUDA available in your environment: + +For CUDA 11.x: + +```bash +pip install --extra-index-url=https://pypi.nvidia.com cudf-cu11 +``` + +For CUDA 12.x: + +```bash +pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 +``` + ### Conda cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel: From d017cf498212f27bb8954cf6c698aa754fe9c8e1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 07:38:29 -1000 Subject: [PATCH 17/60] Clean up `TimedeltaIndex.__init__` constructor (#14775) Aligns the constructor closer to `DatetimeIndex.__init__`: https://github.com/rapidsai/cudf/pull/14774 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14775 --- python/cudf/cudf/core/index.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 96643ef08d3..fa7173f1d0f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2838,8 +2838,8 @@ def __init__( unit=None, freq=None, closed=None, - dtype="timedelta64[ns]", - copy=False, + dtype=None, + copy: bool = False, name=None, ): if freq is not None: @@ -2851,19 +2851,19 @@ def __init__( "dtype parameter is supported" ) - valid_dtypes = tuple( - f"timedelta64[{res}]" for res in ("s", "ms", "us", "ns") - ) - if dtype not in valid_dtypes: - raise TypeError("Invalid dtype") + if dtype is None: + dtype = "timedelta64[ns]" + dtype = cudf.dtype(dtype) + if dtype.kind != "m": + raise TypeError("dtype must be a timedelta type") - kwargs = _setdefault_name(data, name=name) + name = _setdefault_name(data, name=name)["name"] data = column.as_column(data, dtype=dtype) if copy: data = data.copy() - super().__init__(data, **kwargs) + super().__init__(data, name=name) def __getitem__(self, index): value = super().__getitem__(index) From 9bb9af608233ab51490f0e077b7b8371d462c83e Mon Sep 17 00:00:00 2001 From: Danial Javady <122740063+ZelboK@users.noreply.github.com> Date: Fri, 19 Jan 2024 13:45:08 -0500 Subject: [PATCH 18/60] Expose 
streams in public filling APIs for label_bins (#14401) Contributes to #925. Introduces cuda_stream parameter for downstream users to provide for `labeling_bins` Authors: - Danial Javady (https://github.com/ZelboK) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14401 --- cpp/include/cudf/labeling/label_bins.hpp | 4 ++- cpp/src/labeling/label_bins.cu | 12 +++----- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/labeling_bins_test.cpp | 37 ++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 9 deletions(-) create mode 100644 cpp/tests/streams/labeling_bins_test.cpp diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp index 2776f50a939..d8ea262dfe1 100644 --- a/cpp/include/cudf/labeling/label_bins.hpp +++ b/cpp/include/cudf/labeling/label_bins.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,6 +64,7 @@ enum class inclusive { YES, NO }; * @param left_inclusive Whether or not the left edge is inclusive. * @param right_edges Value of the right edge of each bin. * @param right_inclusive Whether or not the right edge is inclusive. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device. * @return The integer labels of the elements in `input` according to the specified bins. */ @@ -73,6 +74,7 @@ std::unique_ptr label_bins( inclusive left_inclusive, column_view const& right_edges, inclusive right_inclusive, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu index 1a603785a41..9fecaa1ddb2 100644 --- a/cpp/src/labeling/label_bins.cu +++ b/cpp/src/labeling/label_bins.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -236,15 +236,11 @@ std::unique_ptr label_bins(column_view const& input, inclusive left_inclusive, column_view const& right_edges, inclusive right_inclusive, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::label_bins(input, - left_edges, - left_inclusive, - right_edges, - right_inclusive, - cudf::get_default_stream(), - mr); + return detail::label_bins( + input, left_edges, left_inclusive, right_edges, right_inclusive, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b385c63e9cd..a3b982a6719 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -653,6 +653,7 @@ ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/labeling_bins_test.cpp b/cpp/tests/streams/labeling_bins_test.cpp new file mode 100644 index 00000000000..a1d3983aacc --- /dev/null +++ b/cpp/tests/streams/labeling_bins_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +class LabelingBinsStreamTest : public cudf::test::BaseFixture {}; + +TEST_F(LabelingBinsStreamTest, SimpleStringsTest) +{ + cudf::test::strings_column_wrapper left_edges{"a", "b", "c", "d", "e"}; + cudf::test::strings_column_wrapper right_edges{"b", "c", "d", "e", "f"}; + cudf::test::strings_column_wrapper input{"abc", "bcd", "cde", "def", "efg"}; + + cudf::label_bins(input, + left_edges, + cudf::inclusive::YES, + right_edges, + cudf::inclusive::NO, + cudf::test::get_default_stream()); +} From 446da756f0703556734a7409534db3bdc01c7975 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 19 Jan 2024 13:50:27 -0500 Subject: [PATCH 19/60] Move cudf::char_utf8 definition from detail to public header (#14779) Moves the `cudf::char_utf8` definition from the `cudf/strings/detail/utf8.hpp` to `cudf/types.hpp` since it is declared in the public namespace and used in public functions. 
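In practice, public code can now pick up the alias from the public types header alone; a minimal sketch (not taken from this patch):

```cpp
#include <cudf/types.hpp>

#include <cstdint>
#include <type_traits>

// The alias now ships with the public types header; the detail header
// cudf/strings/detail/utf8.hpp is no longer needed just for the typedef.
static_assert(std::is_same_v<cudf::char_utf8, std::uint32_t>);
```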
Reference: https://github.com/rapidsai/cudf/blob/9acddc08cc209e8d6b94891be6131edd63ff5b43/docs/cudf/source/conf.py#L372-L375 Authors: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14779 --- cpp/include/cudf/strings/detail/utf8.hpp | 5 +- cpp/include/cudf/types.hpp | 2 +- docs/cudf/source/conf.py | 83 +++++++++++++++--------- 3 files changed, 55 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index e04572535de..5587597cb51 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,9 +23,6 @@ */ namespace cudf { - -using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes - namespace strings { namespace detail { diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 86750ea4ca8..baf07fa3db6 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -62,7 +62,6 @@ class mutable_column_view; class string_view; class list_view; class struct_view; - class scalar; // clang-format off @@ -95,6 +94,7 @@ using size_type = int32_t; ///< Row index type for columns and tables using bitmask_type = uint32_t; ///< Bitmask type stored as 32-bit unsigned integer using valid_type = uint8_t; ///< Valid type in host memory using thread_index_type = int64_t; ///< Thread index type in kernels +using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes /** * @brief Similar to `std::distance` but returns `cudf::size_type` and performs `static_cast` diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index a0bb555365e..34ffd7f0258 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -33,16 +33,16 @@ class PseudoLexer(RegexLexer): """Trivial lexer for pseudocode.""" - name = 'pseudocode' - aliases = ['pseudo'] + name = "pseudocode" + aliases = ["pseudo"] tokens = { - 'root': [ - (r'.*\n', PText), + "root": [ + (r".*\n", PText), ] } -lexers['pseudo'] = PseudoLexer() +lexers["pseudo"] = PseudoLexer() # -- Custom Extensions ---------------------------------------------------- sys.path.append(os.path.abspath("./_ext")) @@ -69,6 +69,7 @@ class PseudoLexer(RegexLexer): "myst_nb", ] + # Preprocess doxygen xml for compatibility with latest Breathe def clean_definitions(root): # Breathe can't handle SFINAE properly: @@ -105,10 +106,14 @@ def clean_definitions(root): pass break - # All of these in type declarations cause Breathe to choke. 
# For friend, see https://github.com/breathe-doc/breathe/issues/916 - strings_to_remove = ("__forceinline__", "CUDF_HOST_DEVICE", "decltype(auto)", "friend") + strings_to_remove = ( + "__forceinline__", + "CUDF_HOST_DEVICE", + "decltype(auto)", + "friend", + ) for node in root.iter(): for string in strings_to_remove: if node.text is not None: @@ -116,6 +121,7 @@ def clean_definitions(root): if node.tail is not None: node.tail = node.tail.replace(string, "") + def clean_all_xml_files(path): for fn in glob.glob(os.path.join(path, "*.xml")): tree = ET.parse(fn) @@ -130,7 +136,7 @@ def clean_all_xml_files(path): breathe_default_project = "libcudf" -nb_execution_excludepatterns = ['performance-comparisons.ipynb'] +nb_execution_excludepatterns = ["performance-comparisons.ipynb"] nb_execution_mode = "force" nb_execution_timeout = 300 @@ -163,9 +169,9 @@ def clean_all_xml_files(path): # built documents. # # The short X.Y version. -version = '24.02' +version = "24.02" # The full version, including alpha/beta/rc tags. -release = '24.02.00' +release = "24.02.00" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -177,7 +183,10 @@ def clean_all_xml_files(path): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['venv', "**/includes/**",] +exclude_patterns = [ + "venv", + "**/includes/**", +] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" @@ -286,7 +295,10 @@ def clean_all_xml_files(path): "pyarrow": ("https://arrow.apache.org/docs/", None), "python": ("https://docs.python.org/3", None), "rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None), - "typing_extensions": ("https://typing-extensions.readthedocs.io/en/stable/", None), + "typing_extensions": ( + "https://typing-extensions.readthedocs.io/en/stable/", + None, + ), } # Config numpydoc @@ -333,12 +345,15 @@ def _generate_namespaces(namespaces): all_namespaces.append(f"{base_namespace}::{other_namespace}::") return all_namespaces -_all_namespaces = _generate_namespaces({ - # Note that io::datasource is actually a nested class - "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"}, - "numeric": {}, - "nvtext": {}, -}) + +_all_namespaces = _generate_namespaces( + { + # Note that io::datasource is actually a nested class + "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"}, + "numeric": {}, + "nvtext": {}, + } +) _names_to_skip = { # External names @@ -369,10 +384,6 @@ def _generate_namespaces(namespaces): # Unsupported by Breathe # https://github.com/breathe-doc/breathe/issues/355 "deprecated", - # TODO: This type is currently defined in a detail header but it's in - # the public namespace. However, it's used in the detail header, so it - # needs to be put into a public header that can be shared. 
- "char_utf8", # TODO: This is currently in a src file but perhaps should be public "orc::column_statistics", # Sphinx doesn't know how to distinguish between the ORC and Parquet @@ -396,21 +407,24 @@ def _cached_intersphinx_lookup(env, node, contnode): key = (node, contnode) if key in _intersphinx_cache: return _intersphinx_cache[key] - if (ref := intersphinx.resolve_reference_detect_inventory(env, node, contnode)) is not None: + if ( + ref := intersphinx.resolve_reference_detect_inventory( + env, node, contnode + ) + ) is not None: _intersphinx_cache[key] = ref return ref def on_missing_reference(app, env, node, contnode): # These variables are defined outside the function to speed up the build. - global _all_namespaces, _names_to_skip, _intersphinx_extra_prefixes, \ - _domain_objects, _prefixed_domain_objects, _intersphinx_cache + global _all_namespaces, _names_to_skip, _intersphinx_extra_prefixes, _domain_objects, _prefixed_domain_objects, _intersphinx_cache # Precompute and cache domains for faster lookups if _domain_objects is None: _domain_objects = {} _prefixed_domain_objects = {} - for (name, _, _, docname, _, _) in env.domains["cpp"].get_objects(): + for name, _, _, docname, _, _ in env.domains["cpp"].get_objects(): _domain_objects[name] = docname for prefix in _all_namespaces: _prefixed_domain_objects[f"{prefix}{name}"] = name @@ -473,7 +487,9 @@ def on_missing_reference(app, env, node, contnode): # to fail. if reftarget != node["reftarget"]: node["reftarget"] = reftarget - if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None: + if ( + ref := _cached_intersphinx_lookup(env, node, contnode) + ) is not None: return ref # If the template wasn't the (only) issue, we check the various @@ -481,11 +497,15 @@ def on_missing_reference(app, env, node, contnode): for prefix in _intersphinx_extra_prefixes: if prefix not in reftarget: node["reftarget"] = f"{prefix}::{reftarget}" - if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None: + if ( + ref := _cached_intersphinx_lookup(env, node, contnode) + ) is not None: return ref else: node["reftarget"] = reftarget.replace(f"{prefix}::", "") - if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None: + if ( + ref := _cached_intersphinx_lookup(env, node, contnode) + ) is not None: return ref return None @@ -499,8 +519,11 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "typing_extensions.Self"), ] + def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") - app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", on_missing_reference) From 51ecef3b8616eb851e23814f21db7b705cf75504 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 09:40:51 -1000 Subject: [PATCH 20/60] Remove **kwargs from astype (#14765) The aligns with pandas's behavior of `astype` which doesn't accept kwargs. 
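For example, after this change extra keyword arguments are rejected just as they are in pandas (hypothetical snippet, not taken from the PR):

```python
import cudf

s = cudf.Series(["2001-01-01"])
s.astype("datetime64[ns]")  # still supported

# extra keyword arguments are no longer accepted, matching pandas.Series.astype:
# s.astype("datetime64[ns]", format="%Y-%m-%d")  # raises TypeError
```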
AFAICT, `**kwargs` was never really respected in the first place Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/14765 --- python/cudf/cudf/core/column/column.py | 10 ++++------ python/cudf/cudf/core/dataframe.py | 10 ++++++++-- python/cudf/cudf/core/frame.py | 13 +++++-------- python/cudf/cudf/core/indexed_frame.py | 11 ++++++++--- python/cudf/cudf/core/series.py | 10 ++++++++-- python/cudf/cudf/tests/test_dataframe.py | 19 +++++++++++-------- 6 files changed, 44 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 705862c502a..bc7b1ed97c0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -958,9 +958,7 @@ def distinct_count(self, dropna: bool = True) -> int: def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() - def astype( - self, dtype: Dtype, copy: bool = False, format: str | None = None - ) -> ColumnBase: + def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if copy: col = self.copy() else: @@ -1000,7 +998,7 @@ def astype( f"Casting to {dtype} is not supported, use " "`.astype('str')` instead." ) - return col.as_string_column(dtype, format=format) + return col.as_string_column(dtype) elif isinstance(dtype, (ListDtype, StructDtype)): if not col.dtype == dtype: raise NotImplementedError( @@ -1012,9 +1010,9 @@ def astype( elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return col.as_decimal_column(dtype) elif np.issubdtype(cast(Any, dtype), np.datetime64): - return col.as_datetime_column(dtype, format=format) + return col.as_datetime_column(dtype) elif np.issubdtype(cast(Any, dtype), np.timedelta64): - return col.as_timedelta_column(dtype, format=format) + return col.as_timedelta_column(dtype) else: return col.as_numerical_column(dtype) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f9cf180ff44..2f18c194fde 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -19,6 +19,7 @@ Callable, Dict, List, + Literal, MutableMapping, Optional, Set, @@ -1774,7 +1775,12 @@ def _concat( return out - def astype(self, dtype, copy=False, errors="raise", **kwargs): + def astype( + self, + dtype, + copy: bool = False, + errors: Literal["raise", "ignore"] = "raise", + ): if is_dict_like(dtype): if len(set(dtype.keys()) - set(self._data.names)) > 0: raise KeyError( @@ -1783,7 +1789,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): ) else: dtype = {cc: dtype for cc in self._data.names} - return super().astype(dtype, copy, errors, **kwargs) + return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 05104a3ef05..cb8f8b9cc7b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -276,14 +276,11 @@ def __len__(self): return self._num_rows @_cudf_nvtx_annotate - def astype(self, dtype, copy=False, **kwargs): - result_data = {} - for col_name, col in self._data.items(): - dt = dtype.get(col_name, col.dtype) - if not is_dtype_equal(dt, col.dtype): - result_data[col_name] = col.astype(dt, copy=copy, **kwargs) - else: - result_data[col_name] = col.copy() if copy else col + def astype(self, dtype, copy: bool = False): + result_data = { + col_name: 
col.astype(dtype.get(col_name, col.dtype), copy=copy) + for col_name, col in self._data.items() + } return ColumnAccessor._create_unsafe( data=result_data, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 70be5c3ad0f..8cd9cc54889 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -14,6 +14,7 @@ Callable, Dict, List, + Literal, MutableMapping, Optional, Tuple, @@ -3736,7 +3737,12 @@ def _append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) - def astype(self, dtype, copy=False, errors="raise", **kwargs): + def astype( + self, + dtype, + copy: bool = False, + errors: Literal["raise", "ignore"] = "raise", + ): """Cast the object to the given dtype. Parameters @@ -3757,7 +3763,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object. - **kwargs : extra arguments to pass on to the constructor Returns ------- @@ -3848,7 +3853,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): raise ValueError("invalid error value specified") try: - data = super().astype(dtype, copy, **kwargs) + data = super().astype(dtype, copy) except Exception as e: if errors == "raise": raise e diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bc1eaef86db..55100343306 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -12,6 +12,7 @@ from typing import ( Any, Dict, + Literal, MutableMapping, Optional, Sequence, @@ -2141,7 +2142,12 @@ def nullmask(self): return cudf.Series(self._column.nullmask) @_cudf_nvtx_annotate - def astype(self, dtype, copy=False, errors="raise", **kwargs): + def astype( + self, + dtype, + copy: bool = False, + errors: Literal["raise", "ignore"] = "raise", + ): if is_dict_like(dtype): if len(dtype) > 1 or self.name not in dtype: raise KeyError( @@ -2150,7 +2156,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): ) else: dtype = {self.name: dtype} - return super().astype(dtype, copy, errors, **kwargs) + return super().astype(dtype, copy, errors) @_cudf_nvtx_annotate def sort_index(self, axis=0, *args, **kwargs): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 30530e9d2a3..d75db7dfaae 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import array as arr import contextlib @@ -5114,15 +5114,18 @@ def test_df_astype_to_categorical_ordered(ordered): @pytest.mark.parametrize( - "dtype,args", - [(dtype, {}) for dtype in ALL_TYPES] - + [("category", {"ordered": True}), ("category", {"ordered": False})], + "dtype", + [dtype for dtype in ALL_TYPES] + + [ + cudf.CategoricalDtype(ordered=True), + cudf.CategoricalDtype(ordered=False), + ], ) -def test_empty_df_astype(dtype, args): +def test_empty_df_astype(dtype): df = cudf.DataFrame() - kwargs = {} - kwargs.update(args) - assert_eq(df, df.astype(dtype=dtype, **kwargs)) + result = df.astype(dtype=dtype) + assert_eq(df, result) + assert_eq(df.to_pandas().astype(dtype=dtype), result) @pytest.mark.parametrize( From a38fc01a6b8cb8506753b6a7fd77c7444e25d52c Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:58:42 -0500 Subject: [PATCH 21/60] Fix shared-workflows repo name (#14784) Fix the repo name for `shared-workflows` Authors: - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14784 --- .github/workflows/pr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index edcc140b191..c94724bcf8c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -133,11 +133,11 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 with: build_command: | sccache -z; - build-all -DBUILD_BENCHMARKS=ON -DNVBench_ENABLE_CUPTI=OFF --verbose; + build-all -DBUILD_BENCHMARKS=ON --verbose; sccache -s; unit-tests-cudf-pandas: needs: wheel-build-cudf From bfbaf3f47789aec4dc1d15230968fd82c019af8a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:40:15 -0500 Subject: [PATCH 22/60] Add BaseOffset as a final proxy type to pass instancechecks for offsets against `BaseOffset` (#14678) Addresses part of https://github.com/rapidsai/cudf/issues/14674. This PR makes `cudf.pandas` aware of the `BaseOffset` type. Because of the way `isinstance()` checks work on proxy types*, we will now pass instance checks like: ```python freq = "D" assert isinstance(pd.tseries.frequencies.to_offset(freq), pd.tseries.offsets.BaseOffset) ``` *proxy types appear to be related to each other in the same way as their corresponding slow types. So if `A` inherits from `B`, then `ProxyA` _appears_ to inherit from `ProxyB` even though they are not _actually_ related by inheritance. 
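The snippet above assumes the `cudf.pandas` accelerator is active; one way to reproduce it end to end (a hypothetical session that mirrors the test added in this PR):

```python
# load the accelerator first, e.g. in IPython/Jupyter:
#   %load_ext cudf.pandas
# or from the command line:
#   python -m cudf.pandas my_script.py
import pandas as pd

offset = pd.tseries.frequencies.to_offset("1s")
# `offset` is a proxy object, but the instance check now passes:
assert isinstance(offset, pd.tseries.offsets.BaseOffset)
```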
Authors: - Ashwin Srinath (https://github.com/shwina) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14678 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 11 ++++++++++- python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 5 +++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index c50e72b4b12..5ea2af7d002 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import copyreg @@ -1036,6 +1036,15 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) +BaseOffset = make_final_proxy_type( + "BaseOffset", + _Unusable, + pd.offsets.BaseOffset, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), + additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, +) + Day = make_final_proxy_type( "Day", _Unusable, diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index bd8ca9f1640..44f301819ed 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1233,3 +1233,8 @@ def test_concat_fast(): def test_func_namespace(): # note: this test is sensitive to Pandas' internal module layout assert xpd.concat is xpd.core.reshape.concat.concat + + +def test_isinstance_base_offset(): + offset = xpd.tseries.frequencies.to_offset("1s") + assert isinstance(offset, xpd.tseries.offsets.BaseOffset) From 3f7983e4bac3878d47e168784a2db5fba81262e2 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:41:07 -0500 Subject: [PATCH 23/60] Create strings-specific make_offsets_child_column for multiple offset types (#14612) Creates `cudf::strings::detail::make_offsets_child_column` for specific use with strings offsets to enable int64 offsets. This also introduces the `LIBCUDF_LARGE_STRINGS_THRESHOLD` environment variable which can be used at runtime to produce int64 offsets for an arbitrary number of character bytes. This utility is not used in this PR but will be employed in a future PR. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14612 --- .../cudf/strings/detail/strings_children.cuh | 62 ++++++++++++++++++- cpp/include/cudf/strings/detail/utilities.hpp | 14 +++++ cpp/src/strings/utilities.cu | 9 +++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 5f8a2a34606..42a180c27c1 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,6 +122,66 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } +/** + * @brief Create an offsets column to be a child of a compound column + * + * This function sets the offsets values by executing scan over the sizes in the provided + * Iterator. + * + * The return also includes the total number of elements -- the last element value from the + * scan. + * + * @tparam InputIterator Used as input to scan to set the offset values + * @param begin The beginning of the input sequence + * @param end The end of the input sequence + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Offsets column and total elements + */ +template +std::pair, int64_t> make_offsets_child_column( + InputIterator begin, + InputIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto constexpr size_type_max = static_cast(std::numeric_limits::max()); + auto const lcount = static_cast(std::distance(begin, end)); + CUDF_EXPECTS( + lcount <= size_type_max, "Size of output exceeds the column size limit", std::overflow_error); + auto const strings_count = static_cast(lcount); + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets_column->mutable_view().template data(); + + // The number of offsets is strings_count+1 so to build the offsets from the sizes + // using exclusive-scan technically requires strings_count+1 input values even though + // the final input value is never used. + // The input iterator is wrapped here to allow the 'last value' to be safely read. + auto map_fn = cuda::proclaim_return_type( + [begin, strings_count] __device__(size_type idx) -> size_type { + return idx < strings_count ? 
static_cast(begin[idx]) : size_type{0}; + }); + auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); + // Use the sizes-to-offsets iterator to compute the total number of elements + auto const total_elements = + sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); + + // TODO: replace exception with if-statement when enabling creating INT64 offsets + CUDF_EXPECTS(total_elements <= size_type_max, + "Size of output exceeds the character size limit", + std::overflow_error); + // if (total_elements >= get_offset64_threshold()) { + // // recompute as int64 offsets when above the threshold + // offsets_column = make_numeric_column( + // data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + // auto d_offsets64 = offsets_column->mutable_view().template data(); + // sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + // } + + return std::pair(std::move(offsets_column), total_elements); +} + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index e279ee2eb65..3cf2850548d 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -53,9 +53,23 @@ rmm::device_uvector create_string_vector_from_column( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Return the threshold size for a strings column to use int64 offsets + * + * A computed size above this threshold should using int64 offsets, otherwise + * int32 offsets. By default this function will return std::numeric_limits::max(). + * This value can be overridden at runtime using the environment variable + * LIBCUDF_LARGE_STRINGS_THRESHOLD. + * + * @return size in bytes + */ +int64_t get_offset64_threshold(); + /** * @brief Return a normalized offset value from a strings offsets column * + * The maximum value returned is `std::numeric_limits::max()`. + * * @throw std::invalid_argument if `offsets` is neither INT32 nor INT64 * * @param offsets Input column of type INT32 or INT64 diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 13f4776ca33..782d9767fb5 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -129,6 +129,15 @@ special_case_mapping const* get_special_case_mapping_table() }); } +int64_t get_offset64_threshold() +{ + auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); + std::size_t const rtn = threshold != nullptr ? std::atol(threshold) : 0; + return (rtn > 0 && rtn < std::numeric_limits::max()) + ? 
rtn + : std::numeric_limits::max(); +} + int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream) From 09b6e45b7e02bc5964d2a32020171040f82c308b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 10:52:35 -1000 Subject: [PATCH 24/60] Consolidate cudf object handling in as_column (#14754) Since these paths are very similar, consolidating these paths and returning early for this case Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14754 --- python/cudf/cudf/core/column/column.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bc7b1ed97c0..df5d1c3879a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1871,6 +1871,7 @@ def as_column( If None (default), treats NaN values in arbitrary as null if there is no mask passed along with it. If True, combines the mask and NaNs to form a new validity mask. If False, leaves NaN values as is. + Only applies when arbitrary is not a cudf object (Index, Series, Column). dtype : optional Optionally typecast the constructed Column to the given dtype. @@ -1907,22 +1908,17 @@ def as_column( f'i{cudf.get_option("default_integer_bitwidth")//8}' ) if dtype is not None: - column = column.astype(dtype) + return column.astype(dtype) return column - elif isinstance(arbitrary, ColumnBase): + elif isinstance(arbitrary, (ColumnBase, cudf.Series, cudf.BaseIndex)): + # Ignoring nan_as_null per the docstring + if isinstance(arbitrary, cudf.Series): + arbitrary = arbitrary._column + elif isinstance(arbitrary, cudf.BaseIndex): + arbitrary = arbitrary._values if dtype is not None: return arbitrary.astype(dtype) - else: - return arbitrary - elif isinstance(arbitrary, cudf.Series): - data = arbitrary._column - if dtype is not None: - data = data.astype(dtype) - elif isinstance(arbitrary, cudf.BaseIndex): - data = arbitrary._values - if dtype is not None: - data = data.astype(dtype) - + return arbitrary elif hasattr(arbitrary, "__cuda_array_interface__"): desc = arbitrary.__cuda_array_interface__ shape = desc["shape"] From 1c37c780ced37d6084c90b815b274b598665d60e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 11:50:33 -1000 Subject: [PATCH 25/60] Some `frame.py` typing, move seldom used methods in `frame.py` (#14766) * `_drop_na_columns` was only use on `IndexedFrame` and not `Frame` so moved the method to `IndexedFrame * `_has_nulls` is equivalent to `isna().any()` * Some typing Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14766 --- python/cudf/cudf/core/frame.py | 97 +++++++++------------- python/cudf/cudf/core/groupby/groupby.py | 8 +- python/cudf/cudf/core/indexed_frame.py | 36 +++++++- python/cudf/cudf/core/udf/groupby_utils.py | 5 +- 4 files changed, 77 insertions(+), 69 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cb8f8b9cc7b..fc313a62fd0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -94,10 +94,6 @@ def _dtypes(self): zip(self._data.names, (col.dtype for col in self._data.columns)) ) - @property - def 
_has_nulls(self): - return any(col.has_nulls() for col in self._data.values()) - @_cudf_nvtx_annotate def serialize(self): # TODO: See if self._data can be serialized outright @@ -135,13 +131,13 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate - def _from_data(cls, data: MutableMapping): + def _from_data(cls, data: MutableMapping) -> Self: obj = cls.__new__(cls) Frame.__init__(obj, data) return obj @_cudf_nvtx_annotate - def _from_data_like_self(self, data: MutableMapping): + def _from_data_like_self(self, data: MutableMapping) -> Self: return self._from_data(data) @_cudf_nvtx_annotate @@ -179,7 +175,7 @@ def _mimic_inplace( @property @_cudf_nvtx_annotate - def size(self): + def size(self) -> int: """ Return the number of elements in the underlying data. @@ -272,7 +268,7 @@ def memory_usage(self, deep=False): raise NotImplementedError @_cudf_nvtx_annotate - def __len__(self): + def __len__(self) -> int: return self._num_rows @_cudf_nvtx_annotate @@ -291,7 +287,7 @@ def astype(self, dtype, copy: bool = False): ) @_cudf_nvtx_annotate - def equals(self, other): + def equals(self, other) -> bool: """ Test whether two objects contain the same elements. @@ -375,7 +371,7 @@ def _get_columns_by_label(self, labels, *, downcast=False) -> Self: @property @_cudf_nvtx_annotate - def values(self): + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the DataFrame. @@ -391,7 +387,7 @@ def values(self): @property @_cudf_nvtx_annotate - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a NumPy representation of the data. @@ -547,7 +543,7 @@ def to_numpy( ) @_cudf_nvtx_annotate - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ Replace values where the condition is False. @@ -618,7 +614,7 @@ def where(self, cond, other=None, inplace=False): raise NotImplementedError @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace=False): + def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ Replace values where the condition is True. @@ -728,8 +724,13 @@ def pipe(self, func, *args, **kwargs): @_cudf_nvtx_annotate def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): + self, + value=None, + method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None, + axis=None, + inplace: bool = False, + limit=None, + ) -> Optional[Self]: """Fill null values with ``value`` or specified ``method``. 
Parameters @@ -848,15 +849,15 @@ def fillna( if isinstance(value, cudf.Series): value = value.reindex(self._data.names) elif isinstance(value, cudf.DataFrame): - if not self.index.equals(value.index): - value = value.reindex(self.index) + if not self.index.equals(value.index): # type: ignore[attr-defined] + value = value.reindex(self.index) # type: ignore[attr-defined] else: value = value elif not isinstance(value, abc.Mapping): value = {name: copy.deepcopy(value) for name in self._data.names} else: value = { - key: value.reindex(self.index) + key: value.reindex(self.index) # type: ignore[attr-defined] if isinstance(value, cudf.Series) else value for key, value in value.items() @@ -898,44 +899,14 @@ def _drop_column(self, name): raise KeyError(f"column '{name}' does not exist") del self._data[name] - @_cudf_nvtx_annotate - def _drop_na_columns(self, how="any", subset=None, thresh=None): - """ - Drop columns containing nulls - """ - out_cols = [] - - if subset is None: - df = self - else: - df = self.take(subset) - - if thresh is None: - if how == "all": - thresh = 1 - else: - thresh = len(df) - - for name, col in df._data.items(): - try: - check_col = col.nans_to_nulls() - except AttributeError: - check_col = col - no_threshold_valid_count = ( - len(col) - check_col.null_count - ) < thresh - if no_threshold_valid_count: - continue - out_cols.append(name) - - return self[out_cols] - @_cudf_nvtx_annotate def _quantile_table( self, - q, - interpolation="LINEAR", - is_sorted=False, + q: float, + interpolation: Literal[ + "LINEAR", "LOWER", "HIGHER", "MIDPOINT", "NEAREST" + ] = "LINEAR", + is_sorted: bool = False, column_order=(), null_precedence=(), ): @@ -963,7 +934,7 @@ def _quantile_table( @classmethod @_cudf_nvtx_annotate - def from_arrow(cls, data): + def from_arrow(cls, data: pa.Table) -> Self: """Convert from PyArrow Table to Frame Parameters @@ -1140,7 +1111,7 @@ def to_arrow(self): ) @_cudf_nvtx_annotate - def _positions_from_column_names(self, column_names): + def _positions_from_column_names(self, column_names) -> list[int]: """Map each column name into their positions in the frame. The order of indices returned corresponds to the column order in this @@ -1542,7 +1513,12 @@ def argsort( ).values @_cudf_nvtx_annotate - def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): + def _get_sorted_inds( + self, + by=None, + ascending=True, + na_position: Literal["first", "last"] = "last", + ) -> ColumnBase: """ Get the indices required to sort self according to the columns specified in by. 
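
The `_drop_na_columns` helper moved by this patch backs the public `DataFrame.dropna(axis=1, ...)` path; as a reference for the how/thresh semantics shown above, a minimal usage sketch (values are illustrative only, not part of the patch):

    import cudf

    df = cudf.DataFrame(
        {
            "a": [1.0, None, 3.0],
            "b": cudf.Series([None, None, None], dtype="float64"),
        }
    )
    df.dropna(axis=1, how="all")  # drops "b": every value in it is null
    df.dropna(axis=1, thresh=2)   # keeps only columns with >= 2 non-null values ("a")
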
@@ -1556,13 +1532,14 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): )._columns ] - # If given a scalar need to construct a sequence of length # of columns - if np.isscalar(ascending): - ascending = [ascending] * len(to_sort) + if is_scalar(ascending): + ascending_lst = [ascending] * len(to_sort) + else: + ascending_lst = list(ascending) return libcudf.sort.order_by( to_sort, - ascending, + ascending_lst, na_position, stable=True, ) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6c83bcd9efb..e28ba233c56 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1251,10 +1251,6 @@ def pipe(self, func, *args, **kwargs): def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): - # Nulls are not yet supported - if self.grouping._obj._has_nulls: - raise ValueError("Nulls not yet supported with groupby JIT engine") - chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args ) @@ -1445,9 +1441,7 @@ def mult(df): group_names, offsets, group_keys, grouped_values = self._grouped() if engine == "auto": - if (not grouped_values._has_nulls) and _can_be_jitted( - grouped_values, function, args - ): + if _can_be_jitted(grouped_values, function, args): engine = "jit" else: engine = "cudf" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8cd9cc54889..3e564919090 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2439,7 +2439,9 @@ def sort_values( out.columns = self._data.to_pandas_index() return out - def _n_largest_or_smallest(self, largest, n, columns, keep): + def _n_largest_or_smallest( + self, largest: bool, n: int, columns, keep: Literal["first", "last"] + ): # Get column to operate on if isinstance(columns, str): columns = [columns] @@ -3068,6 +3070,38 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) + @_cudf_nvtx_annotate + def _drop_na_columns(self, how="any", subset=None, thresh=None): + """ + Drop columns containing nulls + """ + out_cols = [] + + if subset is None: + df = self + else: + df = self.take(subset) + + if thresh is None: + if how == "all": + thresh = 1 + else: + thresh = len(df) + + for name, col in df._data.items(): + try: + check_col = col.nans_to_nulls() + except AttributeError: + check_col = col + no_threshold_valid_count = ( + len(col) - check_col.null_count + ) < thresh + if no_threshold_valid_count: + continue + out_cols.append(name) + + return self[out_cols] + def _drop_na_rows(self, how="any", subset=None, thresh=None): """ Drop null rows from `self`. diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index c82f8d2cd7b..06d9296ca0f 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import cupy as cp @@ -209,6 +209,9 @@ def _can_be_jitted(frame, func, args): # Numba requires bytecode to be present to proceed. 
# See https://github.com/numba/numba/issues/4587 return False + + if any(col.has_nulls() for col in frame._data.values()): + return False np_field_types = np.dtype( list( _supported_dtypes_from_frame( From eb850fa839a559da5893c1e673d0f5a2ca22e57b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sat, 20 Jan 2024 00:56:27 -0500 Subject: [PATCH 26/60] Add pylibcudf to the docs (#14791) Although pylibcudf remains internal for now, this PR enables publishing documentation in the Python guide so that we can add docs as APIs are created and verify that they appear as desired, as well as benefiting from Sphinx's validation of the published docstrings. This will make migrating those docs to a more public view later much easier. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14791 --- docs/cudf/source/conf.py | 22 +- .../cudf/source/user_guide/api_docs/index.rst | 1 + .../user_guide/api_docs/pylibcudf/column.rst | 6 + .../user_guide/api_docs/pylibcudf/copying.rst | 6 + .../api_docs/pylibcudf/gpumemoryview.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 16 + .../user_guide/api_docs/pylibcudf/scalar.rst | 6 + .../user_guide/api_docs/pylibcudf/table.rst | 6 + .../user_guide/api_docs/pylibcudf/types.rst | 6 + python/cudf/cudf/_lib/pylibcudf/column.pxd | 4 +- python/cudf/cudf/_lib/pylibcudf/column.pyx | 25 +- python/cudf/cudf/_lib/pylibcudf/copying.pyx | 288 +++++++++++++++++- .../cudf/_lib/pylibcudf/gpumemoryview.pyx | 9 +- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 13 +- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 24 +- python/cudf/cudf/_lib/pylibcudf/table.pyx | 18 +- python/cudf/cudf/_lib/pylibcudf/types.pyx | 4 +- 17 files changed, 432 insertions(+), 28 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 34ffd7f0258..92d0c767ccf 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -355,7 +355,18 @@ def _generate_namespaces(namespaces): } ) -_names_to_skip = { + +_names_to_skip_in_pylibcudf = { + # Cython types that don't alias cleanly because of + # https://github.com/cython/cython/issues/5609 + "size_type", + "type_id", + # Unknown base types + "int32_t", +} + + +_names_to_skip_in_cpp = { # External names "thrust", "cuda", @@ -418,7 +429,9 @@ def _cached_intersphinx_lookup(env, node, contnode): def on_missing_reference(app, env, node, contnode): # These variables are defined outside the function to speed up the build. 
- global _all_namespaces, _names_to_skip, _intersphinx_extra_prefixes, _domain_objects, _prefixed_domain_objects, _intersphinx_cache + global _all_namespaces, _names_to_skip_in_cpp, \ + _names_to_skip_in_pylibcudf, _intersphinx_extra_prefixes, \ + _domain_objects, _prefixed_domain_objects, _intersphinx_cache # Precompute and cache domains for faster lookups if _domain_objects is None: @@ -443,6 +456,9 @@ def on_missing_reference(app, env, node, contnode): node["reftarget"] = "cudf::column_device_view" return contnode + if any(toskip in reftarget for toskip in _names_to_skip_in_pylibcudf): + return contnode + if (refid := node.get("refid")) is not None and "hpp" in refid: # We don't want to link to C++ header files directly from the # Sphinx docs, those are pages that doxygen automatically @@ -450,7 +466,7 @@ def on_missing_reference(app, env, node, contnode): return contnode if node["refdomain"] in ("std", "cpp") and reftarget is not None: - if any(toskip in reftarget for toskip in _names_to_skip): + if any(toskip in reftarget for toskip in _names_to_skip_in_cpp): return contnode # Strip template parameters and just use the base type. diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst index 01047a31462..b3442908531 100644 --- a/docs/cudf/source/user_guide/api_docs/index.rst +++ b/docs/cudf/source/user_guide/api_docs/index.rst @@ -23,3 +23,4 @@ This page provides a list of all publicly accessible modules, methods and classe struct_handling options extension_dtypes + pylibcudf/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst new file mode 100644 index 00000000000..d1105d356b4 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst @@ -0,0 +1,6 @@ +====== +Column +====== + +.. automodule:: cudf._lib.pylibcudf.column + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst new file mode 100644 index 00000000000..fddd3ea440f --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst @@ -0,0 +1,6 @@ +======= +copying +======= + +.. automodule:: cudf._lib.pylibcudf.copying + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst new file mode 100644 index 00000000000..dffc7c24e02 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst @@ -0,0 +1,6 @@ +============= +gpumemoryview +============= + +.. automodule:: cudf._lib.pylibcudf.gpumemoryview + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst new file mode 100644 index 00000000000..435278afeeb --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -0,0 +1,16 @@ +========= +pylibcudf +========= + +This page provides API documentation for pylibcudf. + +.. toctree:: + :maxdepth: 1 + :caption: API Documentation + + column + copying + gpumemoryview + scalar + table + types diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst new file mode 100644 index 00000000000..b12f47618fb --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst @@ -0,0 +1,6 @@ +====== +Scalar +====== + +.. 
automodule:: cudf._lib.pylibcudf.scalar + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst new file mode 100644 index 00000000000..d8337b6596d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst @@ -0,0 +1,6 @@ +===== +Table +===== + +.. automodule:: cudf._lib.pylibcudf.table + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst new file mode 100644 index 00000000000..8d5409bbd97 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst @@ -0,0 +1,6 @@ +===== +types +===== + +.. automodule:: cudf._lib.pylibcudf.types + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index 27b77438c79..a821c9186a0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -44,7 +44,7 @@ cdef class Column: cpdef gpumemoryview null_mask(self) cpdef list children(self) - cpdef list_view(self) + cpdef ListColumnView list_view(self) cdef class ListColumnView: diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 389a1c82be5..dbe8d4feb37 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,12 +16,12 @@ from .utils cimport int_to_bitmask_ptr, int_to_void_ptr cdef class Column: """A container of nullable device data as a column of elements. - This class is an implementation of [Arrow columnar data - specification](https://arrow.apache.org/docs/format/Columnar.html) for data - stored on GPUs. It relies on Python memoryview-like semantics to maintain - shared ownership of the data it is constructed with, so any input data may - also be co-owned by other data structures. The Column is designed to be - operated on using algorithms backed by libcudf. + This class is an implementation of `Arrow columnar data specification + `__ for data stored on + GPUs. It relies on Python memoryview-like semantics to maintain shared + ownership of the data it is constructed with, so any input data may also be + co-owned by other data structures. The Column is designed to be operated on + using algorithms backed by libcudf. 
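
To make the shared-ownership model described above concrete, a minimal interop sketch using only APIs documented in this PR (the package-level import and the exact return values are assumptions based on these diffs, so treat it as illustrative rather than canonical):

    import pyarrow as pa
    from cudf._lib import pylibcudf as plc

    # Build a device Table from host Arrow data, then inspect one of its Columns.
    tbl = plc.Table.from_arrow(pa.table({"a": [1, 2, None]}))
    col = tbl.columns()[0]         # a pylibcudf Column backed by device memory
    col.size(), col.null_count()   # 3 rows, 1 null
    col.data(), col.null_mask()    # gpumemoryview buffers co-owned with the Table
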
Parameters ---------- @@ -217,25 +217,32 @@ cdef class Column: """The number of children of this column.""" return self._num_children - cpdef list_view(self): + cpdef ListColumnView list_view(self): + """Accessor for methods of a Column that are specific to lists.""" return ListColumnView(self) cpdef gpumemoryview data(self): + """The data buffer of the column.""" return self._data cpdef gpumemoryview null_mask(self): + """The null mask of the column.""" return self._mask cpdef size_type size(self): + """The number of elements in the column.""" return self._size cpdef size_type offset(self): + """The offset of the column.""" return self._offset cpdef size_type null_count(self): + """The number of null elements in the column.""" return self._null_count cpdef list children(self): + """The children of the column.""" return self._children @@ -247,7 +254,9 @@ cdef class ListColumnView: self._column = col cpdef child(self): + """The data column of the underlying list column.""" return self._column.child(1) cpdef offsets(self): + """The offsets column of the underlying list column.""" return self._column.child(1) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index c08b57c05d1..65f8c7a1854 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython.operator import dereference @@ -45,8 +45,6 @@ cdef vector[reference_wrapper[const scalar]] _as_vector(list source): return c_scalars -# TODO: Is it OK to reference the corresponding libcudf algorithm in the -# documentation? Otherwise there's a lot of room for duplication. cpdef Table gather( Table source_table, Column gather_map, @@ -54,7 +52,7 @@ cpdef Table gather( ): """Select rows from source_table according to the provided gather_map. - For details on the implementation, see cudf::gather in libcudf. + For details, see :cpp:func:`gather`. Parameters ---------- @@ -84,6 +82,24 @@ cpdef Table gather( cpdef Table scatter_table(Table source, Column scatter_map, Table target_table): + """Scatter rows from source into target_table according to scatter_map. + + For details, see :cpp:func:`scatter`. + + Parameters + ---------- + source : Table + The table object from which to pull data. + scatter_map : Column + A mapping from rows in source to rows in target_table. + target_table : Table + The table object into which to scatter data. + + Returns + ------- + pylibcudf.Table + The result of the scatter + """ cdef unique_ptr[table] c_result with nogil: @@ -100,6 +116,24 @@ cpdef Table scatter_table(Table source, Column scatter_map, Table target_table): # TODO: Could generalize list to sequence cpdef Table scatter_scalars(list source, Column scatter_map, Table target_table): + """Scatter scalars from source into target_table according to scatter_map. + + For details, see :cpp:func:`scatter`. + + Parameters + ---------- + source : List[Scalar] + A list of scalars to scatter into target_table. + scatter_map : Column + A mapping from rows in source to rows in target_table. + target_table : Table + The table object into which to scatter data. 
+ + Returns + ------- + pylibcudf.Table + The result of the scatter + """ cdef vector[reference_wrapper[const scalar]] source_scalars = \ _as_vector(source) @@ -117,6 +151,20 @@ cpdef Table scatter_scalars(list source, Column scatter_map, Table target_table) cpdef object empty_column_like(Column input): + """Create an empty column with the same type as input. + + For details, see :cpp:func:`empty_like`. + + Parameters + ---------- + input : Column + The column to use as a template for the output. + + Returns + ------- + pylibcudf.Column + An empty column with the same type as input. + """ cdef unique_ptr[column] c_column_result with nogil: c_column_result = move( @@ -128,6 +176,20 @@ cpdef object empty_column_like(Column input): cpdef object empty_table_like(Table input): + """Create an empty table with the same type as input. + + For details, see :cpp:func:`empty_like`. + + Parameters + ---------- + input : Table + The table to use as a template for the output. + + Returns + ------- + pylibcudf.Table + An empty table with the same type as input. + """ cdef unique_ptr[table] c_table_result with nogil: c_table_result = move( @@ -141,6 +203,26 @@ cpdef object empty_table_like(Table input): cpdef Column allocate_like( Column input_column, mask_allocation_policy policy, size=None ): + """Allocate a column with the same type as input_column. + + For details, see :cpp:func:`allocate_like`. + + Parameters + ---------- + input_column : Column + The column to use as a template for the output. + policy : mask_allocation_policy + Controls whether the output column has a valid mask. + size : int, optional + The number of elements to allocate in the output column. If not + specified, the size of the input column is used. + + Returns + ------- + pylibcudf.Column + A column with the same type and size as input. + """ + cdef unique_ptr[column] c_result cdef size_type c_size = size if size is not None else input_column.size() @@ -163,6 +245,26 @@ cpdef Column copy_range_in_place( size_type input_end, size_type target_begin, ): + """Copy a range of elements from input_column to target_column. + + The target_column is overwritten in place. + + For details on the implementation, see :cpp:func:`copy_range_in_place`. + + Parameters + ---------- + input_column : Column + The column from which to copy elements. + target_column : Column + The column into which to copy elements. + input_begin : int + The index of the first element in input_column to copy. + input_end : int + The index of the last element in input_column to copy. + target_begin : int + The index of the first element in target_column to overwrite. + """ + # Need to initialize this outside the function call so that Cython doesn't # try and pass a temporary that decays to an rvalue reference in where the # function requires an lvalue reference. @@ -184,6 +286,28 @@ cpdef Column copy_range( size_type input_end, size_type target_begin, ): + """Copy a range of elements from input_column to target_column. + + For details on the implementation, see :cpp:func:`copy_range`. + + Parameters + ---------- + input_column : Column + The column from which to copy elements. + target_column : Column + The column into which to copy elements. + input_begin : int + The index of the first element in input_column to copy. + input_end : int + The index of the last element in input_column to copy. + target_begin : int + The index of the first element in target_column to overwrite. 
+ + Returns + ------- + pylibcudf.Column + A copy of target_column with the specified range overwritten. + """ cdef unique_ptr[column] c_result with nogil: @@ -199,6 +323,25 @@ cpdef Column copy_range( cpdef Column shift(Column input, size_type offset, Scalar fill_values): + """Shift the elements of input by offset. + + For details on the implementation, see :cpp:func:`shift`. + + Parameters + ---------- + input : Column + The column to shift. + offset : int + The number of elements to shift by. + fill_values : Scalar + The value to use for elements that are shifted in from outside the + bounds of the input column. + + Returns + ------- + pylibcudf.Column + A copy of input shifted by offset. + """ cdef unique_ptr[column] c_result with nogil: c_result = move( @@ -212,6 +355,22 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): cpdef list column_split(Column input_column, list splits): + """Split input_column into multiple columns. + + For details on the implementation, see :cpp:func:`split`. + + Parameters + ---------- + input_column : Column + The column to split. + splits : List[int] + The indices at which to split the column. + + Returns + ------- + List[pylibcudf.Column] + The result of splitting input_column. + """ cdef vector[size_type] c_splits cdef int split for split in splits: @@ -234,6 +393,22 @@ cpdef list column_split(Column input_column, list splits): cpdef list table_split(Table input_table, list splits): + """Split input_table into multiple tables. + + For details on the implementation, see :cpp:func:`split`. + + Parameters + ---------- + input_table : Table + The table to split. + splits : List[int] + The indices at which to split the table. + + Returns + ------- + List[pylibcudf.Table] + The result of splitting input_table. + """ cdef vector[size_type] c_splits = splits cdef vector[table_view] c_result with nogil: @@ -252,6 +427,22 @@ cpdef list table_split(Table input_table, list splits): cpdef list column_slice(Column input_column, list indices): + """Slice input_column according to indices. + + For details on the implementation, see :cpp:func:`slice`. + + Parameters + ---------- + input_column : Column + The column to slice. + indices : List[int] + The indices to select from input_column. + + Returns + ------- + List[pylibcudf.Column] + The result of slicing input_column. + """ cdef vector[size_type] c_indices = indices cdef vector[column_view] c_result with nogil: @@ -270,6 +461,22 @@ cpdef list column_slice(Column input_column, list indices): cpdef list table_slice(Table input_table, list indices): + """Slice input_table according to indices. + + For details on the implementation, see :cpp:func:`slice`. + + Parameters + ---------- + input_table : Table + The table to slice. + indices : List[int] + The indices to select from input_table. + + Returns + ------- + List[pylibcudf.Table] + The result of slicing input_table. + """ cdef vector[size_type] c_indices = indices cdef vector[table_view] c_result with nogil: @@ -288,6 +495,26 @@ cpdef list table_slice(Table input_table, list indices): cpdef Column copy_if_else(object lhs, object rhs, Column boolean_mask): + """Copy elements from lhs or rhs into a new column according to boolean_mask. + + For details on the implementation, see :cpp:func:`copy_if_else`. + + Parameters + ---------- + lhs : Column or Scalar + The column or scalar to copy from if the corresponding element in + boolean_mask is True. 
+ rhs : Column or Scalar + The column or scalar to copy from if the corresponding element in + boolean_mask is False. + boolean_mask : Column + The boolean mask to use to select elements from lhs and rhs. + + Returns + ------- + pylibcudf.Column + The result of copying elements from lhs and rhs according to boolean_mask. + """ cdef unique_ptr[column] result if isinstance(lhs, Column) and isinstance(rhs, Column): @@ -333,6 +560,24 @@ cpdef Column copy_if_else(object lhs, object rhs, Column boolean_mask): cpdef Table boolean_mask_table_scatter(Table input, Table target, Column boolean_mask): + """Scatter rows from input into target according to boolean_mask. + + For details on the implementation, see :cpp:func:`boolean_mask_scatter`. + + Parameters + ---------- + input : Table + The table object from which to pull data. + target : Table + The table object into which to scatter data. + boolean_mask : Column + A mapping from rows in input to rows in target. + + Returns + ------- + pylibcudf.Table + The result of the scatter + """ cdef unique_ptr[table] result with nogil: @@ -349,6 +594,24 @@ cpdef Table boolean_mask_table_scatter(Table input, Table target, Column boolean # TODO: Could generalize list to sequence cpdef Table boolean_mask_scalars_scatter(list input, Table target, Column boolean_mask): + """Scatter scalars from input into target according to boolean_mask. + + For details on the implementation, see :cpp:func:`boolean_mask_scatter`. + + Parameters + ---------- + input : List[Scalar] + A list of scalars to scatter into target. + target : Table + The table object into which to scatter data. + boolean_mask : Column + A mapping from rows in input to rows in target. + + Returns + ------- + pylibcudf.Table + The result of the scatter + """ cdef vector[reference_wrapper[const scalar]] source_scalars = _as_vector(input) cdef unique_ptr[table] result @@ -363,7 +626,24 @@ cpdef Table boolean_mask_scalars_scatter(list input, Table target, Column boolea return Table.from_libcudf(move(result)) + cpdef Scalar get_element(Column input_column, size_type index): + """Get the element at index from input_column. + + For details on the implementation, see :cpp:func:`get_element`. + + Parameters + ---------- + input_column : Column + The column from which to get the element. + index : int + The index of the element to get. + + Returns + ------- + pylibcudf.Scalar + The element at index from input_column. + """ cdef unique_ptr[scalar] c_output with nogil: c_output = move( diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx index fc98f087a1b..a2f5b2ac387 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx @@ -1,13 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. cdef class gpumemoryview: """Minimal representation of a memory buffer. - This class aspires to be a GPU equivalent of the [Python memoryview - type](https://docs.python.org/3/library/stdtypes.html#memoryview) for any - objects exposing a [CUDA Array - Interface](https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html). + This class aspires to be a GPU equivalent of :py:class:`memoryview` for any + objects exposing a `CUDA Array Interface + `__. It will be expanded to encompass more memoryview functionality over time. 
""" # TODO: dlpack support diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 0cdca275027..1ec5eb2e71a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,9 +1,20 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from cudf._lib.cpp.interop cimport column_metadata cdef class ColumnMetadata: + """Metadata associated with a column. + + This is the Cython representation of :cpp:class:`cudf::column_metadata`. + + Parameters + ---------- + id : TypeId + The type's identifier + scale : int + The scale associated with the data. Only used for decimal data types. + """ def __init__(self, name): self.name = name self.children_meta = [] diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 965f10999f2..a1a347bc924 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython cimport no_gc_clear from cython.operator cimport dereference @@ -31,7 +31,10 @@ from .types cimport DataType, type_id # https://github.com/rapidsai/rmm/pull/931 for details. @no_gc_clear cdef class Scalar: - """A scalar value in device memory.""" + """A scalar value in device memory. + + This is the Cython representation of :cpp:class:`cudf::scalar`. + """ # Unlike for columns, libcudf does not support scalar views. All APIs that # accept scalar values accept references to the owning object rather than a # special view type. As a result, pylibcudf.Scalar has a simpler structure @@ -50,6 +53,16 @@ cdef class Scalar: @staticmethod def from_arrow(pa.Scalar value, DataType data_type=None): + """Create a Scalar from a pyarrow Scalar. + + Parameters + ---------- + value : pyarrow.Scalar + The pyarrow scalar to construct from + data_type : DataType, optional + The data type of the scalar. If not passed, the data type will be + inferred from the pyarrow scalar. + """ # Allow passing a dtype, but only for the purpose of decimals for now cdef shared_ptr[pa.CScalar] cscalar = ( @@ -100,6 +113,13 @@ cdef class Scalar: return s cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata): + """Convert to a pyarrow scalar. + + Parameters + ---------- + metadata : ColumnMetadata + The metadata for the column the scalar is being used in. + """ cdef shared_ptr[pa.CScalar] c_result cdef column_metadata c_metadata = metadata.to_libcudf() diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 6a6fad46d69..6d25d215f28 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr @@ -83,10 +83,19 @@ cdef class Table: ]) cpdef list columns(self): + """The columns in this table.""" return self._columns @staticmethod def from_arrow(pa.Table pyarrow_table): + """Create a Table from a PyArrow Table. + + Parameters + ---------- + pyarrow_table : pyarrow.Table + The PyArrow Table to convert to a Table. 
+ """ + cdef shared_ptr[pa.CTable] ctable = ( pa.pyarrow_unwrap_table(pyarrow_table) ) @@ -98,6 +107,13 @@ cdef class Table: return Table.from_libcudf(move(c_result)) cpdef pa.Table to_arrow(self, list metadata): + """Convert to a PyArrow Table. + + Parameters + ---------- + metadata : list + The metadata to attach to the columns of the table. + """ cdef shared_ptr[pa.CTable] c_result cdef vector[column_metadata] c_metadata cdef ColumnMetadata meta diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index b1391723f0e..931ab9fde39 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t @@ -10,7 +10,7 @@ from cudf._lib.cpp.types import type_id as TypeId # no-cython-lint cdef class DataType: """Indicator for the logical data type of an element in a column. - This is the Cython representation of libcudf's data_type. + This is the Cython representation of :cpp:class:`cudf::data_type`. Parameters ---------- From 19942809679e4675c296a38f90bfdbaa8574eee2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Sat, 20 Jan 2024 01:39:00 -0500 Subject: [PATCH 27/60] Enable intermediate proxies to be picklable (#14752) Closes https://github.com/rapidsai/cudf/issues/14738 Enables intermediate proxy types to be pickled, same as final proxy types. Authors: - Ashwin Srinath (https://github.com/shwina) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14752 --- python/cudf/cudf/pandas/fast_slow_proxy.py | 77 ++++++++++++++----- python/cudf/cudf/pandas/module_accelerator.py | 7 +- .../cudf_pandas_tests/test_cudf_pandas.py | 8 ++ 3 files changed, 70 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 3dc6a59cc16..d132116af61 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -25,6 +25,11 @@ from .annotation import nvtx + +def call_operator(fn, args, kwargs): + return fn(*args, **kwargs) + + _CUDF_PANDAS_NVTX_COLORS = { "COPY_SLOW_TO_FAST": 0xCA0020, "COPY_FAST_TO_SLOW": 0xF4A582, @@ -189,22 +194,6 @@ def _fsproxy_state(self) -> _State: else _State.SLOW ) - def __reduce__(self): - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - pickled_wrapped_obj = pickle.dumps(self._fsproxy_wrapped) - return (_PickleConstructor(type(self)), (), pickled_wrapped_obj) - - def __setstate__(self, state): - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - unpickled_wrapped_obj = pickle.loads(state) - self._fsproxy_wrapped = unpickled_wrapped_obj - slow_dir = dir(slow_type) cls_dict = { "__init__": __init__, @@ -215,9 +204,8 @@ def __setstate__(self, state): "_fsproxy_slow_to_fast": _fsproxy_slow_to_fast, "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow, "_fsproxy_state": _fsproxy_state, - "__reduce__": __reduce__, - "__setstate__": __setstate__, } + if additional_attributes is None: additional_attributes = {} for method in _SPECIAL_METHODS: @@ -716,6 +704,27 @@ def _fsproxy_wrap(cls, value, func): proxy._fsproxy_wrapped = value return proxy + def __reduce__(self): + """ + In conjunction with `__proxy_setstate__`, this effectively enables + proxy types to be pickled and unpickled by pickling and unpickling + the underlying wrapped types. + """ + # Need a local import to avoid circular import issues + from .module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + pickled_wrapped_obj = pickle.dumps(self._fsproxy_wrapped) + return (_PickleConstructor(type(self)), (), pickled_wrapped_obj) + + def __setstate__(self, state): + # Need a local import to avoid circular import issues + from .module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + unpickled_wrapped_obj = pickle.loads(state) + self._fsproxy_wrapped = unpickled_wrapped_obj + class _IntermediateProxy(_FastSlowProxy): """ @@ -772,6 +781,34 @@ def _fsproxy_fast_to_slow(self) -> Any: args, kwargs = _slow_arg(args), _slow_arg(kwargs) return func(*args, **kwargs) + def __reduce__(self): + """ + In conjunction with `__proxy_setstate__`, this effectively enables + proxy types to be pickled and unpickled by pickling and unpickling + the underlying wrapped types. + """ + # Need a local import to avoid circular import issues + from .module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + pickled_wrapped_obj = pickle.dumps(self._fsproxy_wrapped) + pickled_method_chain = pickle.dumps(self._method_chain) + return ( + _PickleConstructor(type(self)), + (), + (pickled_wrapped_obj, pickled_method_chain), + ) + + def __setstate__(self, state): + # Need a local import to avoid circular import issues + from .module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + unpickled_wrapped_obj = pickle.loads(state[0]) + unpickled_method_chain = pickle.loads(state[1]) + self._fsproxy_wrapped = unpickled_wrapped_obj + self._method_chain = unpickled_method_chain + class _CallableProxyMixin: """ @@ -788,7 +825,7 @@ def __call__(self, *args, **kwargs) -> Any: # _fast_slow_function_call) to avoid infinite recursion. 
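
    # A sketch of the round trip enabled by the __reduce__/__setstate__ pair added
    # in this patch, mirroring the new test_pickle_groupby below; it assumes the
    # cudf.pandas accelerator is active (e.g. `python -m cudf.pandas script.py`),
    # so the groupby object below is an _IntermediateProxy wrapping a cudf GroupBy.
    import pickle
    import pandas as pd

    gb = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}).groupby("a")
    gb2 = pickle.loads(pickle.dumps(gb))  # round-trips via the wrapped object
    gb2.sum()                             # same result as the original groupby
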
# TODO: When Python 3.11 is the minimum supported Python version # this can use operator.call - lambda fn, args, kwargs: fn(*args, **kwargs), + call_operator, self, args, kwargs, diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 180d75d96e8..e97d6e4af24 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -551,12 +551,15 @@ def getattr_real_or_wrapped( # release the lock after reading this value) use_real = not loader._use_fast_lib if not use_real: + CUDF_PANDAS_PATH = __file__.rsplit("/", 1)[0] # Only need to check the denylist if we're not turned off. frame = sys._getframe() # We cannot possibly be at the top level. assert frame.f_back calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) - use_real = any( + use_real = not calling_module.is_relative_to( + CUDF_PANDAS_PATH + ) and any( calling_module.is_relative_to(path) for path in loader._denylist ) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 44f301819ed..738ff24f374 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1235,6 +1235,14 @@ def test_func_namespace(): assert xpd.concat is xpd.core.reshape.concat.concat +def test_pickle_groupby(dataframe): + pdf, df = dataframe + pgb = pdf.groupby("a") + gb = df.groupby("a") + gb = pickle.loads(pickle.dumps(gb)) + tm.assert_equal(pgb.sum(), gb.sum()) + + def test_isinstance_base_offset(): offset = xpd.tseries.frequencies.to_offset("1s") assert isinstance(offset, xpd.tseries.offsets.BaseOffset) From f258d04e6c46e8edb31fbb17294d2be6579d973b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Jan 2024 05:02:19 -1000 Subject: [PATCH 28/60] Remove unused/single use methods (#14739) * Didn't see use of `get_numeric_type_info` * `Column._minmax` was only used once. 
Replaced with `Column._reduce` * `CategoryColumn.unary_operator` raises like the base class Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14739 --- python/cudf/cudf/core/column/categorical.py | 6 ------ python/cudf/cudf/core/column/column.py | 16 +++++----------- python/cudf/cudf/core/resample.py | 9 +++++---- python/cudf/cudf/utils/dtypes.py | 12 ------------ 4 files changed, 10 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6b3ee0ba852..0ddb31efbfe 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -823,12 +823,6 @@ def ordered(self) -> bool: def ordered(self, value: bool): self.dtype.ordered = value - def unary_operator(self, unaryop: str): - raise TypeError( - f"Series of dtype `category` cannot perform the operation: " - f"{unaryop}" - ) - def __setitem__(self, key, value): if cudf.api.types.is_scalar( value diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index df5d1c3879a..7a99ef9f470 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1241,12 +1241,6 @@ def normalize_binop_value( ) -> Union[ColumnBase, ScalarLike]: raise NotImplementedError - def _minmax(self, skipna: Optional[bool] = None): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, ColumnBase): - return libcudf.reduce.minmax(result_col) - return result_col - def _reduce( self, op: str, @@ -1273,13 +1267,13 @@ def _reduce( def _process_for_reduction( self, skipna: Optional[bool] = None, min_count: int = 0 ) -> Union[ColumnBase, ScalarLike]: - skipna = True if skipna is None else skipna + if skipna is None: + skipna = True - if skipna: - if self.has_nulls(): + if self.has_nulls(): + if skipna: result_col = self.dropna() - else: - if self.has_nulls(): + else: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) result_col = self diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index fbf25104303..0226c778da3 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION & +# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & # AFFILIATES. All rights reserved. SPDX-License-Identifier: # Apache-2.0 # @@ -217,10 +217,11 @@ def _handle_frequency_grouper(self, by): # get the start and end values that will be used to generate # the bin labels - min_date, max_date = key_column._minmax() + min_date = key_column._reduce("min") + max_date = key_column._reduce("max") start, end = _get_timestamp_range_edges( - pd.Timestamp(min_date.value), - pd.Timestamp(max_date.value), + pd.Timestamp(min_date), + pd.Timestamp(max_date), offset, closed=closed, ) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 72721b5197f..df363b72909 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,7 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import datetime -from collections import namedtuple from decimal import Decimal import cupy as cp @@ -139,17 +138,6 @@ def np_to_pa_dtype(dtype): return _np_pa_dtypes[cudf.dtype(dtype).type] -def get_numeric_type_info(dtype): - _TypeMinMax = namedtuple("_TypeMinMax", "min,max") - if dtype.kind in {"i", "u"}: - info = np.iinfo(dtype) - return _TypeMinMax(info.min, info.max) - elif dtype.kind == "f": - return _TypeMinMax(dtype.type("-inf"), dtype.type("+inf")) - else: - raise TypeError(dtype) - - def numeric_normalize_types(*args): """Cast all args to a common type using numpy promotion logic""" dtype = np.result_type(*[a.dtype for a in args]) From d1c0e2532eca26a8f16f279b1fda7ffd0987ea64 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Jan 2024 05:09:31 -1000 Subject: [PATCH 29/60] Implement to_datetime(..., utc=True) (#14749) closes #13661 Also added typing and fixes a bug where `uint` data would raise a TypeError Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14749 --- python/cudf/cudf/core/tools/datetimes.py | 100 +++++++++++------------ python/cudf/cudf/tests/test_datetime.py | 35 ++++++++ 2 files changed, 85 insertions(+), 50 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 6ec9dcb5f44..14459c81966 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,9 +1,9 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import math import re import warnings -from typing import Sequence, Union +from typing import Literal, Optional, Sequence, Union import cupy as cp import numpy as np @@ -49,16 +49,16 @@ def to_datetime( arg, - errors="raise", - dayfirst=False, - yearfirst=False, - utc=None, - format=None, - exact=True, - unit="ns", - infer_datetime_format=False, + errors: Literal["raise", "coerce", "warn", "ignore"] = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool = False, + format: Optional[str] = None, + exact: bool = True, + unit: str = "ns", + infer_datetime_format: bool = False, origin="unix", - cache=True, + cache: bool = True, ): """ Convert argument to datetime. @@ -80,6 +80,8 @@ def to_datetime( 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). + utc : bool, default False + Whether the result should be have a UTC timezone. format : str, default None The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. 
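
The user-visible effect of the `utc` handling wired into `_process_col` below can be sketched as follows (mirrors the array-like case exercised by the new tests in this patch; values are illustrative):

    import cudf

    # Integer inputs are interpreted as nanosecond epochs by default; with the
    # new utc=True support the result is localized to UTC instead of tz-naive.
    ts = cudf.to_datetime(cudf.Series([1, 2]), utc=True)
    # ts.dtype -> datetime64[ns, UTC], matching pandas.to_datetime([1, 2], utc=True)
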
@@ -148,9 +150,6 @@ def to_datetime( if yearfirst: raise NotImplementedError("yearfirst support is not yet implemented") - if utc: - raise NotImplementedError("utc is not yet implemented") - if format is not None: if "%Z" in format or "%z" in format: raise NotImplementedError( @@ -165,24 +164,24 @@ def to_datetime( required = ["year", "month", "day"] req = list(set(required) - set(arg._data.names)) if len(req): - req = ",".join(req) + err_req = ",".join(req) raise ValueError( f"to assemble mappings requires at least that " - f"[year, month, day] be specified: [{req}] " + f"[year, month, day] be specified: [{err_req}] " f"is missing" ) # replace passed column name with values in _unit_map - unit = {k: get_units(k) for k in arg._data.names} - unit_rev = {v: k for k, v in unit.items()} + got_units = {k: get_units(k) for k in arg._data.names} + unit_rev = {v: k for k, v in got_units.items()} # keys we don't recognize excess = set(unit_rev.keys()) - set(_unit_map.values()) if len(excess): - excess = ",".join(excess) + err_excess = ",".join(excess) raise ValueError( f"extra keys have been passed to the " - f"datetime assemblage: [{excess}]" + f"datetime assemblage: [{err_excess}]" ) new_series = ( @@ -245,38 +244,29 @@ def to_datetime( col = (col.astype(dtype="int64") + times_column).astype( dtype=col.dtype ) - return cudf.Series(col, index=arg.index) - elif isinstance(arg, cudf.BaseIndex): - col = arg._values - col = _process_col( - col=col, - unit=unit, - dayfirst=dayfirst, - infer_datetime_format=infer_datetime_format, - format=format, - ) - return as_index(col, name=arg.name) - elif isinstance(arg, (cudf.Series, pd.Series)): - col = column.as_column(arg) col = _process_col( col=col, unit=unit, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format, format=format, + utc=utc, ) - return cudf.Series(col, index=arg.index, name=arg.name) + return cudf.Series(col, index=arg.index) else: - col = column.as_column(arg) col = _process_col( - col=col, + col=column.as_column(arg), unit=unit, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format, format=format, + utc=utc, ) - - if is_scalar(arg): + if isinstance(arg, (cudf.BaseIndex, pd.Index)): + return as_index(col, name=arg.name) + elif isinstance(arg, (cudf.Series, pd.Series)): + return cudf.Series(col, index=arg.index, name=arg.name) + elif is_scalar(arg): return col.element_indexing(0) else: return as_index(col) @@ -295,11 +285,18 @@ def to_datetime( return arg -def _process_col(col, unit, dayfirst, infer_datetime_format, format): - if col.dtype.kind == "M": - return col +def _process_col( + col, + unit: str, + dayfirst: bool, + infer_datetime_format: bool, + format: Optional[str], + utc: bool, +): + # Causes circular import + from cudf.core._internals.timezones import localize - elif col.dtype.kind in ("f"): + if col.dtype.kind == "f": if unit not in (None, "ns"): factor = cudf.Scalar( column.datetime._unit_to_nanoseconds_conversion[unit] @@ -325,9 +322,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): ) else: col = col.as_datetime_column(dtype="datetime64[ns]") - return col - elif col.dtype.kind in ("i"): + elif col.dtype.kind in "iu": if unit in ("D", "h", "m"): factor = cudf.Scalar( column.datetime._unit_to_nanoseconds_conversion[unit] @@ -341,9 +337,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): ) else: col = col.as_datetime_column(dtype=_unit_dtype_map[unit]) - return col - elif col.dtype.kind in ("O"): + elif col.dtype.kind == "O": if unit not in (None, "ns") or 
col.null_count == len(col): try: col = col.astype(dtype="int64") @@ -355,6 +350,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): dayfirst=dayfirst, infer_datetime_format=infer_datetime_format, format=format, + utc=utc, ) else: if format is None: @@ -367,13 +363,17 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): element=col.element_indexing(0), dayfirst=dayfirst, ) - return col.as_datetime_column( + col = col.as_datetime_column( dtype=_unit_dtype_map[unit], format=format, ) - raise TypeError( - f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" - ) + elif col.dtype.kind != "M": + raise TypeError( + f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" + ) + if utc and not isinstance(col.dtype, pd.DatetimeTZDtype): + return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT") + return col def get_units(value): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 2ea2885bc7b..deddedbe3e8 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2431,6 +2431,41 @@ def test_to_datetime_errors_non_scalar_not_implemented(errors): cudf.to_datetime([1, ""], unit="s", errors=errors) +@pytest.mark.parametrize( + "box", [list, pd.Index, cudf.Index, pd.Series, cudf.Series] +) +@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) +def test_to_datetime_arraylike_utc_true(box, dtype): + pd_data = [1, 2] + cudf_data = box(pd_data) + if box is not list: + cudf_data = cudf_data.astype(dtype) + if box is cudf.Series or box is pd.Series: + pd_data = pd.Series(pd_data) + result = cudf.to_datetime(cudf_data, utc=True) + expected = pd.to_datetime(pd_data, utc=True) + assert_eq(result, expected) + + +@pytest.mark.xfail( + raises=TypeError, + reason="libcudf.copying.get_element doesn't understand pd.DatetimeTZDtype", +) +def test_to_datetime_scalar_utc_true(): + data = pd.Timestamp(2020, 1, 1) + with cudf.option_context("mode.pandas_compatible", True): + result = cudf.to_datetime(data, utc=True) + expected = pd.Timestamp(year=2020, month=1, day=1, tz="UTC") + assert_eq(result, expected) + + +def test_to_datetime_dataframe_utc_true(): + data = cudf.DataFrame([[2020, 1, 1]], columns=["year", "month", "day"]) + result = cudf.to_datetime(data, utc=True) + expected = pd.Series([datetime.datetime(2020, 1, 1)]).dt.tz_localize("UTC") + assert_eq(result, expected) + + def test_datetimeindex_dtype_np_dtype(): dtype = np.dtype("datetime64[ns]") data = [1] From 8fdc62bbf131963539c380823a5f1d8acea9b5df Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 22 Jan 2024 23:38:29 +0530 Subject: [PATCH 30/60] JSON - Parse mixed types as string in JSON reader (#14572) Addresses https://github.com/rapidsai/cudf/issues/14239 This PR adds an option to read mixed types as string columns. It also adds related functional changes to nested JSON reader (libcudf, cuDF-python, Java). Details: - Added new option `mixed_types_as_string` bool in json_reader_options - This feature requires 2 things: finding end of struct/list nodes, parse struct/list type as string. - For Struct and List, node_range_end was node_range_begin+1 earlier (since it was not used anywhere). Now it is calculated properly by copying only struct and list tokens and their node_range_end is calculated. 
(Since end token is child of begin token, scattering end token's index to parent' token's corresponding node's node_range_end will get the node_range_end of List and Struct nodes). - In `reduce_to_column_tree()` (which infers the schema), the list and struct node_range_end are changed to node_begin+1 so that it does not copy entire list/struct strings to host for column names. - `reinitialize_as_string` reinitializes an initialized column as string. - Mixed type columns are parsed as strings since their column category is changed to `NC_STR`. - Added tests Authors: - Karthikeyan (https://github.com/karthikeyann) - Andy Grove (https://github.com/andygrove) Approvers: - Andy Grove (https://github.com/andygrove) - Jason Lowe (https://github.com/jlowe) - Elias Stehle (https://github.com/elstehle) - Bradley Dice (https://github.com/bdice) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/14572 --- cpp/include/cudf/io/json.hpp | 30 ++++- cpp/src/io/json/json_column.cu | 109 ++++++++++++++-- cpp/src/io/json/json_tree.cu | 122 +++++++++++++++++- cpp/src/io/json/nested_json.hpp | 4 +- cpp/tests/io/json_test.cpp | 113 ++++++++++++++++ cpp/tests/io/json_tree.cpp | 15 ++- .../main/java/ai/rapids/cudf/JSONOptions.java | 21 ++- java/src/main/java/ai/rapids/cudf/Table.java | 18 ++- java/src/main/native/src/TableJni.cpp | 16 ++- .../test/java/ai/rapids/cudf/TableTest.java | 52 +++++++- java/src/test/resources/mixed_types_1.json | 2 + java/src/test/resources/mixed_types_2.json | 2 + python/cudf/cudf/_lib/cpp/io/json.pxd | 7 +- python/cudf/cudf/_lib/json.pyx | 4 +- python/cudf/cudf/io/json.py | 4 +- 15 files changed, 472 insertions(+), 47 deletions(-) create mode 100644 java/src/test/resources/mixed_types_1.json create mode 100644 java/src/test/resources/mixed_types_2.json diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 472d42b1db5..2a39a539cc7 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,8 @@ class json_reader_options { // Read the file as a json object per line bool _lines = false; + // Parse mixed types as a string column + bool _mixed_types_as_string = false; // Bytes to skip from the start size_t _byte_range_offset = 0; @@ -225,6 +227,13 @@ class json_reader_options { */ bool is_enabled_lines() const { return _lines; } + /** + * @brief Whether to parse mixed types as a string column. + * + * @return `true` if mixed types are parsed as a string column + */ + bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } + /** * @brief Whether to parse dates as DD/MM versus MM/DD. * @@ -302,6 +311,13 @@ class json_reader_options { */ void enable_lines(bool val) { _lines = val; } + /** + * @brief Set whether to parse mixed types as a string column. + * + * @param val Boolean value to enable/disable parsing mixed types as a string column + */ + void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -437,6 +453,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether to parse mixed types as a string column. 
+ * + * @param val Boolean value to enable/disable parsing mixed types as a string column + * @return this for chaining + */ + json_reader_options_builder& mixed_types_as_string(bool val) + { + options._mixed_types_as_string = val; + return *this; + } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index f1296daca26..b1dc2c9dd7f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -277,6 +277,16 @@ reduce_to_column_tree(tree_meta_t& tree, return is_non_list_parent(parent_col_id); }); + // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) + thrust::transform_if( + rmm::exec_policy(stream), + col_range_begin.begin(), + col_range_begin.end(), + column_categories.begin(), + col_range_end.begin(), + [] __device__(auto i) { return i + 1; }, + [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); + return std::tuple{tree_meta_t{std::move(column_categories), std::move(parent_col_ids), std::move(column_levels), @@ -407,6 +417,7 @@ struct json_column_data { * @param root Root node of the `d_json_column` tree * @param is_array_of_arrays Whether the tree is an array of arrays * @param is_enabled_lines Whether the input is a line-delimited JSON + * @param is_enabled_mixed_types_as_string Whether to enable reading mixed types as string * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the device memory * of child_offets and validity members of `d_json_column` @@ -418,6 +429,7 @@ void make_device_json_column(device_span input, device_json_column& root, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_mixed_types_as_string, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -516,6 +528,19 @@ void make_device_json_column(device_span input, col.type = to_json_col_type(column_categories[i]); }; + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + col.child_columns.clear(); // their references should be deleted too. + col.column_order.clear(); + }; + // 2. generate nested columns tree and its device_memory // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. auto h_range_col_id_it = @@ -530,6 +555,7 @@ void make_device_json_column(device_span input, std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity std::vector ignore_vals(num_columns, 0); + std::vector is_mixed_type_column(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); for (auto const this_col_id : unique_col_ids) { @@ -552,6 +578,13 @@ void make_device_json_column(device_span input, } else { CUDF_FAIL("Unexpected parent column category"); } + + if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) { + // if parent is mixed type column, ignore this column. 
+ is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + continue; + } // If the child is already found, // replace if this column is a nested column and the existing was a value column // ignore this column if this column is a value column and the existing was a nested column @@ -560,6 +593,24 @@ void make_device_json_column(device_span input, auto& parent_col = it->second.get(); bool replaced = false; if (mapped_columns.count({parent_col_id, name}) > 0) { + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + // VAL/STR or STRUCT or LIST + auto old_col_id = mapped_columns[{parent_col_id, name}]; + + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + // all its children (which are already inserted) are ignored later. + } + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { ignore_vals[this_col_id] = 1; continue; @@ -592,6 +643,28 @@ void make_device_json_column(device_span input, columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudaMemcpyAsync(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudaMemcpyDefault, + stream.value()); + } + // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { return thrust::get<1>(a) < thrust::get<1>(b); @@ -617,14 +690,16 @@ void make_device_json_column(device_span input, rmm::exec_policy(stream), thrust::counting_iterator(0), num_nodes, - [node_categories = tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - switch (node_categories[i]) { + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { 
case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; case NC_STR: [[fallthrough]]; @@ -662,10 +737,14 @@ void make_device_json_column(device_span input, num_nodes, thrust::make_counting_iterator(0), thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [node_categories = tree.node_categories.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); }); auto const num_list_children = @@ -896,8 +975,11 @@ table_with_metadata device_parse_nested_json(device_span d_input, const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource()); // gpu tree generation - return get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + return get_tree_representation(tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string(), + stream, + rmm::mr::get_current_device_resource()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); @@ -941,6 +1023,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, root_column, is_array_of_arrays, options.is_enabled_lines(), + options.is_enabled_mixed_types_as_string(), stream, mr); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 9a70b987fa5..275907c19c9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,6 +134,14 @@ struct node_ranges { } }; +struct is_nested_end { + PdaTokenT const* tokens; + __device__ auto operator()(NodeIndexT i) -> bool + { + return tokens[i] == token_t::StructEnd or tokens[i] == token_t::ListEnd; + } +}; + /** * @brief Returns stable sorted keys and its sorted order * @@ -184,16 +192,16 @@ std::pair, rmm::device_uvector> stable_s } /** - * @brief Propagate parent node to siblings from first sibling. + * @brief Propagate parent node from first sibling to other siblings. * * @param node_levels Node levels of each node * @param parent_node_ids parent node ids initialized for first child of each push node, * and other siblings are initialized to -1. * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -void propagate_parent_to_siblings(cudf::device_span node_levels, - cudf::device_span parent_node_ids, - rmm::cuda_stream_view stream) +void propagate_first_sibling_to_other(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); @@ -212,6 +220,7 @@ void propagate_parent_to_siblings(cudf::device_span node_level // Generates a tree representation of the given tokens, token_indices. tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, + bool is_strict_nested_boundaries, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -297,9 +306,9 @@ tree_meta_t get_tree_representation(device_span tokens, // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + rmm::device_uvector node_token_ids(num_nodes, stream); // needed for SE, LE later // This block of code is generalized logical stack algorithm. TODO: make this a separate function. { - rmm::device_uvector node_token_ids(num_nodes, stream); cudf::detail::copy_if_safe(thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, tokens.begin(), @@ -345,7 +354,7 @@ tree_meta_t get_tree_representation(device_span tokens, }); } // Propagate parent node to siblings from first sibling - inplace. - propagate_parent_to_siblings( + propagate_first_sibling_to_other( cudf::device_span{node_levels.data(), node_levels.size()}, parent_node_ids, stream); @@ -380,6 +389,105 @@ tree_meta_t get_tree_representation(device_span tokens, stream); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + // Extract Struct, List range_end: + // 1. Extract Struct, List - begin & end separately, their token ids + // 2. push, pop to get levels + // 3. copy first child's parent token_id, also translate to node_id + // 4. propagate to siblings using levels, parent token id. (segmented scan) + // 5. scatter to node_range_end for only nested end tokens. + if (is_strict_nested_boundaries) { + // Whether the token is nested + auto const is_nested = [] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::StructBegin: + case token_t::StructEnd: + case token_t::ListBegin: + case token_t::ListEnd: return true; + default: return false; + }; + }; + auto const num_nested = + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_nested); + rmm::device_uvector token_levels(num_nested, stream); + rmm::device_uvector token_id(num_nested, stream); + rmm::device_uvector parent_node_ids(num_nested, stream); + auto const push_pop_it = thrust::make_transform_iterator( + tokens.begin(), + cuda::proclaim_return_type( + [] __device__(PdaTokenT const token) -> size_type { + if (token == token_t::StructBegin or token == token_t::ListBegin) { + return 1; + } else if (token == token_t::StructEnd or token == token_t::ListEnd) { + return -1; + } + return 0; + })); + // copy_if only struct/list's token levels, token ids, tokens. 
+ auto zipped_in_it = + thrust::make_zip_iterator(push_pop_it, thrust::make_counting_iterator(0)); + auto zipped_out_it = thrust::make_zip_iterator(token_levels.begin(), token_id.begin()); + cudf::detail::copy_if_safe( + zipped_in_it, zipped_in_it + num_tokens, tokens.begin(), zipped_out_it, is_nested, stream); + + thrust::exclusive_scan( + rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin()); + + // Get parent of first child of struct/list begin. + auto const nested_first_childs_parent_token_id = + [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + auto id = token_id[i - 1]; // current token's predecessor + if (tokens_gpu[id] == token_t::StructBegin or tokens_gpu[id] == token_t::ListBegin) { + return id; + } else { + return -1; + } + }; + + // copied L+S tokens, and their token ids, their token levels. + // initialize first child parent token ids + // translate token ids to node id using similar binary search. + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_nested, + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), + num_nodes, + nested_first_childs_parent_token_id] __device__(NodeIndexT const tid) -> NodeIndexT { + auto const pid = nested_first_childs_parent_token_id(tid); + // token_ids which are converted to nodes, are stored in node_ids_gpu in order + // so finding index of token_id in node_ids_gpu will return its node index. + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below + }); + + // propagate parent node from first sibling to other siblings - inplace. + propagate_first_sibling_to_other( + cudf::device_span{token_levels.data(), token_levels.size()}, + parent_node_ids, + stream); + + // scatter to node_range_end for only nested end tokens. + auto token_indices_it = + thrust::make_permutation_iterator(token_indices.begin(), token_id.begin()); + auto nested_node_range_end_it = + thrust::make_transform_output_iterator(node_range_end.begin(), [] __device__(auto i) { + // add +1 to include end symbol. + return i + 1; + }); + auto stencil = thrust::make_transform_iterator(token_id.begin(), is_nested_end{tokens.begin()}); + thrust::scatter_if(rmm::exec_policy(stream), + token_indices_it, + token_indices_it + num_nested, + parent_node_ids.begin(), + stencil, + nested_node_range_end_it); + } + return {std::move(node_categories), std::move(parent_node_ids), std::move(node_levels), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 8d89f4ff927..c13daf9b9f5 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -216,6 +216,7 @@ std::pair, rmm::device_uvector> pr * * @param tokens Vector of token types in the json string * @param token_indices The indices within the input string corresponding to each token + * @param is_strict_nested_boundaries Whether to extract node end of nested types strictly * @param stream The CUDA stream to which kernels are dispatched * @param mr Optional, resource with which to allocate * @return A tree representation of the input JSON string as vectors of node type, parent index, @@ -223,6 +224,7 @@ std::pair, rmm::device_uvector> pr */ tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, + bool is_strict_nested_boundaries, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 7fce31461ef..22c2f0de924 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2050,4 +2050,117 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } +TEST_F(JsonReaderTest, MixedTypes) +{ + { + // Simple test for mixed types + std::string json_string = R"({ "foo": [1,2,3], "bar": 123 } + { "foo": { "a": 1 }, "bar": 456 })"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 2); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT64); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"a\": 1 }"})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + cudf::test::fixed_width_column_wrapper({123, 456})); + } + + // Testing function for mixed types in JSON (for spark json reader) + auto test_fn = [](std::string_view json_string, cudf::column_view expected) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected); + }; + + // test cases. 
+ test_fn(R"( +{ "a": "123" } +{ "a": 123 } +)", + cudf::test::strings_column_wrapper({"123", "123"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": { "b": 1 } } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }"})); + + test_fn(R"( +{ "a": "fox" } +{ "a": { "b": 1 } } +)", + cudf::test::strings_column_wrapper({"fox", "{ \"b\": 1 }"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": "fox" } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "fox"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": [true,false,true] } +{ "a": ["a", "b", "c"] } +)", + cudf::test::lists_column_wrapper{ + {"1", "2", "3"}, {"true", "false", "true"}, {"a", "b", "c"}}); + { + std::string json_string = R"( +{ "var1": true } +{ "var1": [{ "var0": true, "var1": "hello", "var2": null }, null, [true, null, null]] } + )"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + } + + // test to confirm if reinitialize a non-string column as string affects max_rowoffsets. + // max_rowoffsets is generated based on parent col id, + // so, even if mixed types are present, their row offset will be correct. + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + cudf::test::lists_column_wrapper expected_list{ + { + cudf::test::lists_column_wrapper({LCW({"1", "2", "3"}), LCW({"4", "5", "6"})}), + cudf::test::lists_column_wrapper({LCW()}), + cudf::test::lists_column_wrapper({LCW()}), // null + cudf::test::lists_column_wrapper({LCW()}), // null + cudf::test::lists_column_wrapper({LCW({"{\"c\": -1}"}), LCW({"5"})}), + cudf::test::lists_column_wrapper({LCW({"7"}), LCW({"8", "9"})}), + cudf::test::lists_column_wrapper({LCW()}), // null + }, + valid_t{1, 1, 0, 0, 1, 1, 0}.begin()}; + test_fn(R"( +{"b": [ [1, 2, 3], [ 4, 5, 6] ]} +{"b": [[]]} +{} +{} +{"b": [ [ {"c": -1} ], [ 5 ] ]} +{"b": [ [7], [8, 9]]} +{} +)", + expected_list); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index f5d03293d30..40996e4fffa 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -596,7 +596,7 @@ TEST_F(JsonTest, TreeRepresentation) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -684,7 +684,7 @@ TEST_F(JsonTest, TreeRepresentation2) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -759,7 +759,7 @@ TEST_F(JsonTest, TreeRepresentation3) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, 
rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -785,9 +785,10 @@ TEST_F(JsonTest, TreeRepresentationError) // Get the JSON's tree representation // This JSON is invalid and will raise an exception. - EXPECT_THROW(cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()), + cudf::logic_error); } /** @@ -876,7 +877,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) records_orient_tree_traversal_cpu(input, cpu_tree, is_array_of_arrays, json_lines, stream); // gpu tree generation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); #if LIBCUDF_JSON_DEBUG_DUMP printf("BEFORE traversal (gpu_tree):\n"); diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index f98687df5fa..523d594f8ba 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,14 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; private final boolean recoverWithNull; + private final boolean mixedTypesAsStrings; private JSONOptions(Builder builder) { super(builder); dayFirst = builder.dayFirst; lines = builder.lines; recoverWithNull = builder.recoverWithNull; + mixedTypesAsStrings = builder.mixedTypesAsStrings; } public boolean isDayFirst() { @@ -51,6 +53,10 @@ public boolean isRecoverWithNull() { return recoverWithNull; } + public boolean isMixedTypesAsStrings() { + return mixedTypesAsStrings; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -66,6 +72,8 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull())); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), + opts.isMixedTypesAsStrings())); } /** @@ -1162,7 +1166,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull()))) { + opts.isRecoverWithNull(), opts.isMixedTypesAsStrings()))) { return gatherJSONColumns(schema, twm); } } @@ -1178,7 +1182,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), 
opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), dsHandle))) { + opts.isRecoverWithNull(), opts.isMixedTypesAsStrings(), dsHandle))) { return gatherJSONColumns(schema, twm); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 295574858da..1ac15a3023c 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1392,7 +1392,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean mixed_types_as_string) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1411,7 +1411,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1469,7 +1470,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean mixed_types_as_string, + jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1504,7 +1506,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { @@ -1536,7 +1539,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean mixed_types_as_string) { bool read_buffer = true; if (buffer == 0) { @@ -1586,7 +1589,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 8df8ebea8a7..73002644858 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 
2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,6 +87,8 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); + private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); + private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -327,6 +329,54 @@ void testReadJSONFile() { } } + @Test + void testReadMixedType2JSONFileFeatureDisabled() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(false) + .build(); + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)); + } + + @Test + void testReadMixedType1JSONFile() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("123", "123" ) + .build(); + Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_1_JSON)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadMixedType2JSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("[1,2,3]", "{ \"b\": 1 }" ) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_MIXED_TYPE_2_JSON); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/resources/mixed_types_1.json b/java/src/test/resources/mixed_types_1.json new file mode 100644 index 00000000000..21d625bbf2a --- /dev/null +++ b/java/src/test/resources/mixed_types_1.json @@ -0,0 +1,2 @@ +{ "a": "123" } +{ "a": 123 } diff --git a/java/src/test/resources/mixed_types_2.json b/java/src/test/resources/mixed_types_2.json new file mode 100644 index 00000000000..becad2d0db7 --- /dev/null +++ b/java/src/test/resources/mixed_types_2.json @@ -0,0 +1,2 @@ +{ "a": [1,2,3] } +{ "a": { "b": 1 } } diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index ad618cc4ed6..b916c2b7ad9 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport uint8_t from libcpp cimport bool @@ -27,6 +27,7 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_byte_range_offset() except + size_type get_byte_range_size() except + bool is_enabled_lines() except + + bool is_enabled_mixed_types_as_string() except + bool is_enabled_dayfirst() except + bool is_enabled_experimental() except + @@ -39,6 +40,7 @@ cdef extern from "cudf/io/json.hpp" \ void set_byte_range_offset(size_type offset) except + void set_byte_range_size(size_type size) except + void enable_lines(bool val) except + + void enable_mixed_types_as_string(bool val) except + void enable_dayfirst(bool val) except + void enable_experimental(bool val) except + void enable_keep_quotes(bool val) except + @@ -74,6 +76,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& lines( bool val ) except + + json_reader_options_builder& mixed_types_as_string( + bool val + ) except + json_reader_options_builder& dayfirst( bool val ) except + diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index c361a3f00c4..9bbad0f61c3 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -50,7 +50,8 @@ cpdef read_json(object filepaths_or_buffers, object compression, object byte_range, bool legacy, - bool keep_quotes): + bool keep_quotes, + bool mixed_types_as_string): """ Cython function to call into libcudf API, see `read_json`. @@ -128,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers, opts.set_dtypes(c_dtypes_schema_map) opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ae2f0203642..35d91f9c062 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import warnings from collections import abc @@ -25,6 +25,7 @@ def read_json( byte_range=None, keep_quotes=False, storage_options=None, + mixed_types_as_string=False, *args, **kwargs, ): @@ -116,6 +117,7 @@ def read_json( byte_range, engine == "cudf_legacy", keep_quotes, + mixed_types_as_string, ) else: warnings.warn( From f24f0b528b16454a2b79182f77bb46a663ab2c25 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 22 Jan 2024 10:42:25 -0800 Subject: [PATCH 31/60] Work around incompatibilities between V2 page header handling and zStandard compression in Parquet writer (#14772) In the version 2 Parquet page header, neither the repetition nor definition level data is compressed. The current Parquet writer achieves this by offsetting the input buffers passed to nvcomp to skip this level data. Doing so can lead to mis-aligned data being passed to nvcomp (for zstd, input currently must be aligned on a 4 byte boundary). This PR is a short-term fix that will print an error and exit if zStandard compression is used with V2 page headers. This also fixes an underestimation of the maximum V2 page header size. 
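For illustration, a minimal caller-side sketch of the new guard (it mirrors the `ZstdWithV2Header` test added below; the output path and function name here are placeholders, not part of the patch):

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

void write_zstd_with_v2_headers(cudf::table_view const& tbl)
{
  // Requesting ZSTD compression together with V2 page headers now fails fast
  // with cudf::logic_error instead of handing misaligned level data to nvcomp.
  auto const opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"out.parquet"}, tbl)
      .compression(cudf::io::compression_type::ZSTD)
      .write_v2_headers(true)  // incompatible with ZSTD until #14781 is resolved
      .build();
  cudf::io::write_parquet(opts);  // throws
}
```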
Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14772 --- cpp/src/io/parquet/page_enc.cu | 14 ++++++++++++-- cpp/src/io/parquet/writer_impl.cu | 4 ++++ cpp/tests/io/parquet_writer_test.cpp | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 12af5888d2f..3cc4fda695f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -57,6 +57,13 @@ constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; constexpr int rolling_idx(int pos) { return rolling_index(pos); } +// max V1 header size +// also valid for dict page header (V1 or V2) +constexpr int MAX_V1_HDR_SIZE = util::round_up_unsafe(27, 8); + +// max V2 header size +constexpr int MAX_V2_HDR_SIZE = util::round_up_unsafe(49, 8); + // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -534,6 +541,9 @@ CUDF_KERNEL void __launch_bounds__(128) uint32_t const t = threadIdx.x; auto const data_page_type = write_v2_headers ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE; + // Max page header size excluding statistics + auto const max_data_page_hdr_size = write_v2_headers ? MAX_V2_HDR_SIZE : MAX_V1_HDR_SIZE; + if (t == 0) { col_g = col_desc[blockIdx.x]; ck_g = chunks[blockIdx.y][blockIdx.x]; @@ -584,7 +594,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.hdr_size = 0; - page_g.max_hdr_size = 32; + page_g.max_hdr_size = MAX_V1_HDR_SIZE; page_g.max_data_size = ck_g.uniq_data_size; page_g.start_row = cur_row; page_g.num_rows = ck_g.num_dict_entries; @@ -684,7 +694,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.page_type = data_page_type; page_g.hdr_size = 0; - page_g.max_hdr_size = 32; // Max size excluding statistics + page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if (col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 90f52c0ee70..417577f7b89 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2220,6 +2220,10 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { + // See issue #14781. Can remove this check once that is fixed. + CUDF_EXPECTS(not(_write_v2_headers and _compression == Compression::ZSTD), + "V2 page headers cannot be used with ZSTD compression"); + _current_chunk_offset.resize(_out_sink.size()); // Write file header file_header_s fhdr; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 9415e018c6a..946c0e23f08 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1401,6 +1401,21 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics) EXPECT_EQ(max_value, std::string(max_val)); } +// See #14772. +// zStandard compression cannot currently be used with V2 page headers due to buffer +// alignment issues. +// TODO: Remove this test when #14781 is closed. 
+TEST_F(ParquetWriterTest, ZstdWithV2Header) +{ + auto const expected = table_view{}; + + cudf::io::parquet_writer_options const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"14772.pq"}, expected) + .compression(cudf::io::compression_type::ZSTD) + .write_v2_headers(true); + EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); +} + // custom mem mapped data sink that supports device writes template class custom_test_memmap_sink : public cudf::io::data_sink { From b1468a51933c82fe126a416bd45698cebfa1069c Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 22 Jan 2024 13:55:57 -0800 Subject: [PATCH 32/60] Add CUDF_TEST_PROGRAM_MAIN macro to tests lacking it (#14751) `JSON_PATH_TEST` and `ROW_CONVERSION_TEST` were not using the `CUDF_TEST_PROGRAM_MAIN`, and thus were not picking up the `GTEST_CUDF_RMM_MODE` env variable during nightly testing. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14751 --- cpp/tests/json/json_tests.cpp | 5 ++++- cpp/tests/transform/row_conversion.cpp | 19 +++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index a03880eef5d..548047f0410 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -1016,3 +1017,5 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) do_test("$.x[*].array", "", "null", false); do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]"); } + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp index 542ccc5e2d5..77cc236a4c4 100644 --- a/cpp/tests/transform/row_conversion.cpp +++ b/cpp/tests/transform/row_conversion.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -164,7 +165,7 @@ TEST_F(ColumnToRowTests, ManyStrings) return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; }); - auto const num_rows = 1000000; + auto const num_rows = 1'000'000; auto const num_cols = 50; std::vector schema; @@ -763,7 +764,7 @@ TEST_F(RowToColumnTests, Bigger) std::vector views; std::vector schema; - // 28 columns of 1 million rows + // 128 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, @@ -792,8 +793,8 @@ TEST_F(RowToColumnTests, Biggest) std::vector views; std::vector schema; - // 128 columns of 1 million rows - constexpr auto num_rows = 5 * 1024 * 1024; + // 128 columns of 2 million rows + constexpr auto num_rows = 2 * 1024 * 1024; for (int i = 0; i < 128; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); @@ -916,6 +917,10 @@ TEST_F(RowToColumnTests, BigStrings) TEST_F(RowToColumnTests, ManyStrings) { + // The sizing of this test is very sensitive to the state of the random number generator, + // i.e., depending on the order of execution, the number of times the largest string is + // selected will lead to out-of-memory exceptions. 
Seeding the RNG here helps prevent that. + srand(1); char const* TEST_STRINGS[] = { "These", "are", @@ -954,7 +959,7 @@ TEST_F(RowToColumnTests, ManyStrings) "this string is the longest string because it is duplicated more than you can imagine " "this string is the longest string because it is duplicated more than you can imagine " "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine ", "a", "good test", "is required to produce reasonable confidence that this is working", @@ -971,7 +976,7 @@ TEST_F(RowToColumnTests, ManyStrings) return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; }); - auto const num_rows = 500000; + auto const num_rows = 300'000; auto const num_cols = 50; std::vector schema; @@ -1002,3 +1007,5 @@ TEST_F(RowToColumnTests, ManyStrings) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); } } + +CUDF_TEST_PROGRAM_MAIN() From 42d8d786876dc6ecf8b1f93f4329e91715b826ad Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 22 Jan 2024 15:58:10 -0600 Subject: [PATCH 33/60] Add SHA-1 and SHA-2 hash functions. (#14391) This PR adds support for SHA-1 and SHA-2 (SHA-256, SHA-512, and truncated digests SHA-224, SHA-384) hash functions. Resolves #8641. Replaces #9215. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Matthew Roeschke (https://github.com/mroeschke) - David Wendt (https://github.com/davidwendt) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/14391 --- cpp/CMakeLists.txt | 5 + cpp/benchmarks/hashing/hash.cpp | 40 +- cpp/include/cudf/hashing.hpp | 72 ++- .../cudf/hashing/detail/hash_functions.cuh | 43 +- cpp/include/cudf/hashing/detail/hashing.hpp | 22 +- cpp/src/hash/md5_hash.cu | 23 - cpp/src/hash/sha1_hash.cu | 81 +++ cpp/src/hash/sha224_hash.cu | 82 +++ cpp/src/hash/sha256_hash.cu | 82 +++ cpp/src/hash/sha384_hash.cu | 89 +++ cpp/src/hash/sha512_hash.cu | 89 +++ cpp/src/hash/sha_hash.cuh | 547 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 5 + cpp/tests/hashing/md5_test.cpp | 115 ++-- cpp/tests/hashing/sha1_test.cpp | 208 +++++++ cpp/tests/hashing/sha224_test.cpp | 208 +++++++ cpp/tests/hashing/sha256_test.cpp | 209 +++++++ cpp/tests/hashing/sha384_test.cpp | 226 ++++++++ cpp/tests/hashing/sha512_test.cpp | 226 ++++++++ python/cudf/cudf/_lib/cpp/hash.pxd | 22 +- python/cudf/cudf/_lib/hash.pyx | 28 +- python/cudf/cudf/tests/test_dataframe.py | 19 +- python/cudf/cudf/tests/test_series.py | 4 +- 23 files changed, 2348 insertions(+), 97 deletions(-) create mode 100644 cpp/src/hash/sha1_hash.cu create mode 100644 cpp/src/hash/sha224_hash.cu create mode 100644 cpp/src/hash/sha256_hash.cu create mode 100644 cpp/src/hash/sha384_hash.cu create mode 100644 cpp/src/hash/sha512_hash.cu create mode 100644 cpp/src/hash/sha_hash.cuh create mode 100644 cpp/tests/hashing/sha1_test.cpp create mode 100644 cpp/tests/hashing/sha224_test.cpp create mode 100644 cpp/tests/hashing/sha256_test.cpp create mode 100644 cpp/tests/hashing/sha384_test.cpp create mode 100644 cpp/tests/hashing/sha512_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2c0f601ca74..90eaec6804a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -344,6 +344,11 @@ add_library( src/hash/md5_hash.cu src/hash/murmurhash3_x86_32.cu src/hash/murmurhash3_x64_128.cu + 
src/hash/sha1_hash.cu + src/hash/sha224_hash.cu + src/hash/sha256_hash.cu + src/hash/sha384_hash.cu + src/hash/sha512_hash.cu src/hash/spark_murmurhash3_x86_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 4930fc59ac3..1da7457eb82 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -67,6 +67,36 @@ static void bench_hash(nvbench::state& state) state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); + } else if (hash_name == "sha1") { + // sha1 creates a 40-byte string + state.add_global_memory_writes(40 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); }); + } else if (hash_name == "sha224") { + // sha224 creates a 56-byte string + state.add_global_memory_writes(56 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); }); + } else if (hash_name == "sha256") { + // sha256 creates a 64-byte string + state.add_global_memory_writes(64 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); }); + } else if (hash_name == "sha384") { + // sha384 creates a 96-byte string + state.add_global_memory_writes(96 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); }); + } else if (hash_name == "sha512") { + // sha512 creates a 128-byte string + state.add_global_memory_writes(128 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); }); } else if (hash_name == "spark_murmurhash3_x86_32") { state.add_global_memory_writes(num_rows); @@ -82,4 +112,12 @@ NVBENCH_BENCH(bench_hash) .set_name("hashing") .add_int64_axis("num_rows", {65536, 16777216}) .add_float64_axis("nulls", {0.0, 0.1}) - .add_string_axis("hash_name", {"murmurhash3_x86_32", "md5", "spark_murmurhash3_x86_32"}); + .add_string_axis("hash_name", + {"murmurhash3_x86_32", + "md5", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "spark_murmurhash3_x86_32"}); diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 72e32715ed4..c3a57af1358 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -145,6 +145,76 @@ std::unique_ptr md5( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Computes the SHA-1 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha1( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-224 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha224( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-256 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha256( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-384 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha384( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-512 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha512( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes the XXHash_64 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 7a3d1990791..0ec41a20ef1 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * Copyright (c) 2017-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,4 +68,45 @@ __device__ inline uint64_t rotate_bits_right(uint64_t x, uint32_t r) return (x >> r) | (x << (64 - r)); } +// Swap the endianness of a 32 bit value +__device__ inline uint32_t swap_endian(uint32_t x) +{ + // The selector 0x0123 reverses the byte order + return __byte_perm(x, 0, 0x0123); +} + +// Swap the endianness of a 64 bit value +// There is no CUDA intrinsic for permuting bytes in 64 bit integers +__device__ inline uint64_t swap_endian(uint64_t x) +{ + // Reverse the endianness of each 32 bit section + uint32_t low_bits = swap_endian(static_cast(x)); + uint32_t high_bits = swap_endian(static_cast(x >> 32)); + // Reassemble a 64 bit result, swapping the low bits and high bits + return (static_cast(low_bits) << 32) | (static_cast(high_bits)); +}; + +/** + * Modified GPU implementation of + * https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ + * Copyright (c) 2015 Barry Clark + * Licensed under the MIT license. + * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT + */ +__device__ inline void uint32ToLowercaseHexString(uint32_t num, char* destination) +{ + // Transform 0xABCD'1234 => 0x0000'ABCD'0000'1234 => 0x0B0A'0D0C'0201'0403 + uint64_t x = num; + x = ((x & 0xFFFF'0000u) << 16) | ((x & 0xFFFF)); + x = ((x & 0x000F'0000'000Fu) << 8) | ((x & 0x00F0'0000'00F0u) >> 4) | + ((x & 0x0F00'0000'0F00u) << 16) | ((x & 0xF000'0000'F000) << 4); + + // Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits + uint64_t offsets = (((x + 0x0606'0606'0606'0606) >> 4) & 0x0101'0101'0101'0101) * 0x27; + + x |= 0x3030'3030'3030'3030; + x += offsets; + std::memcpy(destination, reinterpret_cast(&x), 8); +} + } // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index f08d0fbb849..eaeb5d6b068 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,6 +46,26 @@ std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +std::unique_ptr sha1(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha224(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha256(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha384(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha512(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + std::unique_ptr xxhash_64(table_view const& input, uint64_t seed, rmm::cuda_stream_view, diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 002c9a9137b..b34455905d9 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -108,29 +108,6 @@ auto __device__ inline get_element_pointer_and_size(string_view const& element) return thrust::make_pair(reinterpret_cast(element.data()), element.size_bytes()); } -/** - * Modified GPU implementation of - * https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ - * Copyright (c) 2015 Barry Clark - * Licensed under the MIT license. - * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ -void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) -{ - // Transform 0xABCD'1234 => 0x0000'ABCD'0000'1234 => 0x0B0A'0D0C'0201'0403 - uint64_t x = num; - x = ((x & 0xFFFF'0000u) << 16) | ((x & 0xFFFF)); - x = ((x & 0x000F'0000'000Fu) << 8) | ((x & 0x00F0'0000'00F0u) >> 4) | - ((x & 0x0F00'0000'0F00u) << 16) | ((x & 0xF000'0000'F000) << 4); - - // Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits - uint64_t offsets = (((x + 0x0606'0606'0606'0606) >> 4) & 0x0101'0101'0101'0101) * 0x27; - - x |= 0x3030'3030'3030'3030; - x += offsets; - std::memcpy(destination, reinterpret_cast(&x), 8); -} - // The MD5 algorithm and its hash/shift constants are officially specified in // RFC 1321. For convenience, these values can also be found on Wikipedia: // https://en.wikipedia.org/wiki/MD5 diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu new file mode 100644 index 00000000000..71253d279b9 --- /dev/null +++ b/cpp/src/hash/sha1_hash.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "sha_hash.cuh" + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +struct sha1_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint32_t hash_value[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0}; + uint8_t buffer[64]; +}; + +struct SHA1Hash : HashBase { + __device__ inline SHA1Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha1_hash_state; + // The word type used by this hash function + using sha_word_type = uint32_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 64; + // Digest size in bytes + static constexpr uint32_t digest_size = 40; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 8; + + __device__ inline void hash_step(hash_state& state) { sha1_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha1(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sha_hash(input, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha1(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha1(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu new file mode 100644 index 00000000000..61480a78776 --- /dev/null +++ b/cpp/src/hash/sha224_hash.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sha_hash.cuh" + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +struct sha224_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint32_t hash_value[8] = { + 0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4}; + uint8_t buffer[64]; +}; + +struct SHA224Hash : HashBase { + __device__ inline SHA224Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha224_hash_state; + // The word type used by this hash function + using sha_word_type = uint32_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 64; + // Digest size in bytes. This is truncated from SHA-256. 
+ static constexpr uint32_t digest_size = 56; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 8; + + __device__ inline void hash_step(hash_state& state) { sha256_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha224(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sha_hash(input, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha224(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha224(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu new file mode 100644 index 00000000000..b15cfe09d52 --- /dev/null +++ b/cpp/src/hash/sha256_hash.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sha_hash.cuh" + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +struct sha256_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint32_t hash_value[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; + uint8_t buffer[64]; +}; + +struct SHA256Hash : HashBase { + __device__ inline SHA256Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha256_hash_state; + // The word type used by this hash function + using sha_word_type = uint32_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 64; + // Digest size in bytes + static constexpr uint32_t digest_size = 64; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 8; + + __device__ inline void hash_step(hash_state& state) { sha256_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha256(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sha_hash(input, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha256(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha256(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu new file mode 100644 index 00000000000..3075d2c62f8 --- /dev/null +++ b/cpp/src/hash/sha384_hash.cu @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
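A side note on the sha256_hash_state initializer above: those eight words are the standard SHA-256 initial hash values, namely the first 32 bits of the fractional parts of the square roots of the first eight primes. The throwaway host sketch below (not part of the patch) reproduces them.

```
// Recompute the SHA-256 initial hash values: frac(sqrt(p)) * 2^32, truncated
// to 32 bits, for the first eight primes.
#include <cmath>
#include <cstdio>

int main()
{
  int const primes[8] = {2, 3, 5, 7, 11, 13, 17, 19};
  for (int p : primes) {
    double frac = std::sqrt(static_cast<double>(p));
    frac -= std::floor(frac);
    std::printf("0x%08x\n", static_cast<unsigned int>(frac * 4294967296.0));
  }
  // Prints 0x6a09e667, 0xbb67ae85, ..., 0x5be0cd19, matching hash_value above.
  return 0;
}
```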
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sha_hash.cuh" + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +// Need alignas(16) to avoid compiler bug. +struct alignas(16) sha384_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint64_t hash_value[8] = {0xcbbb9d5dc1059ed8, + 0x629a292a367cd507, + 0x9159015a3070dd17, + 0x152fecd8f70e5939, + 0x67332667ffc00b31, + 0x8eb44a8768581511, + 0xdb0c2e0d64f98fa7, + 0x47b5481dbefa4fa4}; + uint8_t buffer[128]; +}; + +struct SHA384Hash : HashBase { + __device__ inline SHA384Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha384_hash_state; + // The word type used by this hash function + using sha_word_type = uint64_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 128; + // Digest size in bytes. This is truncated from SHA-512. + static constexpr uint32_t digest_size = 96; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 16; + + __device__ inline void hash_step(hash_state& state) { sha512_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha384(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sha_hash(input, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha384(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha384(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu new file mode 100644 index 00000000000..d073cf1edca --- /dev/null +++ b/cpp/src/hash/sha512_hash.cu @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sha_hash.cuh" + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +// Need alignas(16) to avoid compiler bug. 
+struct alignas(16) sha512_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint64_t hash_value[8] = {0x6a09e667f3bcc908, + 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, + 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, + 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, + 0x5be0cd19137e2179}; + uint8_t buffer[128]; +}; + +struct SHA512Hash : HashBase { + __device__ inline SHA512Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha512_hash_state; + // The word type used by this hash function + using sha_word_type = uint64_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 128; + // Digest size in bytes + static constexpr uint32_t digest_size = 128; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 16; + + __device__ inline void hash_step(hash_state& state) { sha512_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha512(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sha_hash(input, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha512(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha512(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh new file mode 100644 index 00000000000..0a22ee34918 --- /dev/null +++ b/cpp/src/hash/sha_hash.cuh @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +const __constant__ uint32_t sha256_hash_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +const __constant__ uint64_t sha512_hash_constants[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, +}; + +/** + * @brief A CRTP helper function + * + * https://www.fluentcpp.com/2017/05/19/crtp-helper/ + * + * Does two things: + * 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class. + * 2. 
Avoids having to `static_cast` in a lot of places + * + * @tparam T The derived class in a CRTP hierarchy + */ +template +struct crtp { + __device__ inline T& underlying() { return static_cast(*this); } + __device__ inline T const& underlying() const { return static_cast(*this); } +}; + +template +struct HashBase : public crtp { + char* result_location; + + __device__ inline HashBase(char* result_location) : result_location(result_location) {} + + /** + * @brief Execute SHA on input data chunks. + * + * This accepts arbitrary data, handles it as bytes, and calls the hash step + * when the buffer is filled up to message_chunk_size bytes. + */ + __device__ inline void process(uint8_t const* data, uint32_t len) + { + auto& state = this->underlying().state; + state.message_length += len; + + if (state.buffer_length + len < Hasher::message_chunk_size) { + // The buffer will not be filled by this data. We copy the new data into + // the buffer but do not trigger a hash step yet. + memcpy(state.buffer + state.buffer_length, data, len); + state.buffer_length += len; + } else { + // The buffer will be filled by this data. Copy a chunk of the data to fill + // the buffer and trigger a hash step. + uint32_t copylen = Hasher::message_chunk_size - state.buffer_length; + memcpy(state.buffer + state.buffer_length, data, copylen); + this->underlying().hash_step(state); + + // Take buffer-sized chunks of the data and do a hash step on each chunk. + while (len > Hasher::message_chunk_size + copylen) { + memcpy(state.buffer, data + copylen, Hasher::message_chunk_size); + this->underlying().hash_step(state); + copylen += Hasher::message_chunk_size; + } + + // The remaining data chunk does not fill the buffer. We copy the data into + // the buffer but do not trigger a hash step yet. + memcpy(state.buffer, data + copylen, len - copylen); + state.buffer_length = len - copylen; + } + } + + template + __device__ inline void process_fixed_width(T const& key) + { + uint8_t const* data = reinterpret_cast(&key); + uint32_t constexpr len = sizeof(T); + process(data, len); + } + + /** + * @brief Finalize SHA element processing. + * + * This method fills the remainder of the message buffer with zeros, appends + * the message length (in another step of the hash, if needed), and performs + * the final hash step. + */ + __device__ inline void finalize() + { + auto& state = this->underlying().state; + // Message length in bits. + uint64_t const message_length_in_bits = (static_cast(state.message_length)) << 3; + // Add a one bit flag (10000000) to signal the end of the message + uint8_t constexpr end_of_message = 0x80; + // 1 byte for the end of the message flag + uint32_t constexpr end_of_message_size = 1; + + thrust::fill_n( + thrust::seq, state.buffer + state.buffer_length, end_of_message_size, end_of_message); + + // SHA-512 uses a 128-bit message length instead of a 64-bit message length + // but this code does not support messages with lengths exceeding UINT64_MAX + // bits. We always pad the upper 64 bits with zeros. + uint32_t constexpr message_length_supported_size = sizeof(message_length_in_bits); + + if (state.buffer_length + Hasher::message_length_size + end_of_message_size <= + Hasher::message_chunk_size) { + // Fill the remainder of the buffer with zeros up to the space reserved + // for the message length. The message length fits in this hash step. 
+ thrust::fill(thrust::seq, + state.buffer + state.buffer_length + end_of_message_size, + state.buffer + Hasher::message_chunk_size - message_length_supported_size, + 0x00); + } else { + // Fill the remainder of the buffer with zeros. The message length doesn't + // fit and will be processed in a subsequent hash step comprised of only + // zeros followed by the message length. + thrust::fill(thrust::seq, + state.buffer + state.buffer_length + end_of_message_size, + state.buffer + Hasher::message_chunk_size, + 0x00); + this->underlying().hash_step(state); + + // Fill the entire message with zeros up to the final bytes reserved for + // the message length. + thrust::fill_n(thrust::seq, + state.buffer, + Hasher::message_chunk_size - message_length_supported_size, + 0x00); + } + + // Convert the 64-bit message length from little-endian to big-endian. + uint64_t const full_length_flipped = swap_endian(message_length_in_bits); + memcpy(state.buffer + Hasher::message_chunk_size - message_length_supported_size, + reinterpret_cast(&full_length_flipped), + message_length_supported_size); + this->underlying().hash_step(state); + + // Each byte in the word generates two bytes in the hexadecimal string digest. + // SHA-224 and SHA-384 digests are truncated because their digest does not + // include all of the hash values. + auto constexpr num_words_to_copy = + Hasher::digest_size / (2 * sizeof(typename Hasher::sha_word_type)); + for (int i = 0; i < num_words_to_copy; i++) { + // Convert word representation from big-endian to little-endian. + typename Hasher::sha_word_type flipped = swap_endian(state.hash_value[i]); + if constexpr (std::is_same_v) { + uint32ToLowercaseHexString(flipped, result_location + (8 * i)); + } else if constexpr (std::is_same_v) { + uint32_t low_bits = static_cast(flipped); + uint32ToLowercaseHexString(low_bits, result_location + (16 * i)); + uint32_t high_bits = static_cast(flipped >> 32); + uint32ToLowercaseHexString(high_bits, result_location + (16 * i) + 8); + } else { + cudf_assert(false && "Unsupported SHA word type."); + } + } + }; +}; + +template +struct HasherDispatcher { + Hasher* hasher; + column_device_view input_col; + + __device__ inline HasherDispatcher(Hasher* hasher, column_device_view const& input_col) + : hasher{hasher}, input_col{input_col} + { + } + + template () and not is_floating_point() and + not is_chrono())> + __device__ inline void operator()(size_type row_index) + { + Element const& key = input_col.element(row_index); + hasher->process_fixed_width(key); + } + + template ())> + __device__ inline void operator()(size_type row_index) + { + Element const& key = input_col.element(row_index); + if (isnan(key)) { + Element nan = std::numeric_limits::quiet_NaN(); + hasher->process_fixed_width(nan); + } else if (key == Element{0.0}) { + hasher->process_fixed_width(Element{0.0}); + } else { + hasher->process_fixed_width(key); + } + } + + template )> + __device__ inline void operator()(size_type row_index) + { + string_view key = input_col.element(row_index); + uint8_t const* data = reinterpret_cast(key.data()); + uint32_t const len = static_cast(key.size_bytes()); + hasher->process(data, len); + } + + template () or is_chrono()) and + not std::is_same_v)> + __device__ inline void operator()(size_type row_index) + { + (void)row_index; + cudf_assert(false && "Unsupported type for hash function."); + } +}; + +/** + * @brief Core SHA-1 algorithm implementation + * + * Processes a single 512-bit chunk, updating the hash value so far. 
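Before the per-algorithm compression steps below, it may help to spell out the padding that finalize() above appends as plain arithmetic: one 0x80 byte, k zero bytes, and a big-endian bit length, with k chosen so the total is a whole number of chunks. The standalone sketch below is not part of the patch; it also shows why the 56-character test string used later needs a second chunk.

```
// Padding arithmetic for the SHA-1/SHA-224/SHA-256 family: 64-byte chunks and
// an 8-byte length field (SHA-384/512 use 128-byte chunks and a 16-byte field).
#include <cstdio>

int main()
{
  int const chunk = 64, length_field = 8;
  int const lengths[4] = {0, 55, 56, 64};
  for (int len : lengths) {
    int const zeros = (chunk - (len + 1 + length_field) % chunk + chunk) % chunk;
    int const total = len + 1 + zeros + length_field;
    std::printf("message %2d bytes -> %2d zero bytes, %d chunk(s)\n", len, zeros, total / chunk);
  }
  return 0;
}
```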
+ * This does not zero out the buffer contents. + */ +template +__device__ inline void sha1_hash_step(hash_state& state) +{ + uint32_t words[80]; + + // The 512-bit message buffer fills the first 16 words. + memcpy(&words[0], state.buffer, sizeof(words[0]) * 16); + for (int i = 0; i < 16; i++) { + // Convert word representation from little-endian to big-endian. + words[i] = swap_endian(words[i]); + } + + // The rest of the 80 words are generated from the first 16 words. + for (int i = 16; i < 80; i++) { + uint32_t const temp = words[i - 3] ^ words[i - 8] ^ words[i - 14] ^ words[i - 16]; + words[i] = rotate_bits_left(temp, 1); + } + + uint32_t A = state.hash_value[0]; + uint32_t B = state.hash_value[1]; + uint32_t C = state.hash_value[2]; + uint32_t D = state.hash_value[3]; + uint32_t E = state.hash_value[4]; + + for (int i = 0; i < 80; i++) { + uint32_t F; + uint32_t k; + uint32_t temp; + switch (i / 20) { + case 0: + F = D ^ (B & (C ^ D)); + k = 0x5a827999; + break; + case 1: + F = B ^ C ^ D; + k = 0x6ed9eba1; + break; + case 2: + F = (B & C) | (B & D) | (C & D); + k = 0x8f1bbcdc; + break; + case 3: + F = B ^ C ^ D; + k = 0xca62c1d6; + break; + } + temp = rotate_bits_left(A, 5) + F + E + k + words[i]; + E = D; + D = C; + C = rotate_bits_left(B, 30); + B = A; + A = temp; + } + + state.hash_value[0] += A; + state.hash_value[1] += B; + state.hash_value[2] += C; + state.hash_value[3] += D; + state.hash_value[4] += E; + + state.buffer_length = 0; +} + +/** + * @brief Core SHA-256 algorithm implementation + * + * Processes a single 512-bit chunk, updating the hash value so far. + * This does not zero out the buffer contents. + */ +template +__device__ inline void sha256_hash_step(hash_state& state) +{ + uint32_t words[64]; + + // The 512-bit message buffer fills the first 16 words. + memcpy(&words[0], state.buffer, sizeof(words[0]) * 16); + for (int i = 0; i < 16; i++) { + // Convert word representation from little-endian to big-endian. + words[i] = swap_endian(words[i]); + } + + // The rest of the 64 words are generated from the first 16 words. 
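The four k values selected in the switch above are the usual SHA-1 round constants, floor(2^30 * sqrt(n)) for n in {2, 3, 5, 10}. A quick host check, not part of the patch:

```
// Recompute the SHA-1 round constants used in sha1_hash_step.
#include <cmath>
#include <cstdio>

int main()
{
  int const n[4] = {2, 3, 5, 10};
  for (int v : n) {
    auto const k =
      static_cast<unsigned int>(std::floor(std::ldexp(std::sqrt(static_cast<double>(v)), 30)));
    std::printf("0x%08x\n", k);  // 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
  }
  return 0;
}
```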
+ for (int i = 16; i < 64; i++) { + uint32_t const s0 = rotate_bits_right(words[i - 15], 7) ^ rotate_bits_right(words[i - 15], 18) ^ + (words[i - 15] >> 3); + uint32_t const s1 = rotate_bits_right(words[i - 2], 17) ^ rotate_bits_right(words[i - 2], 19) ^ + (words[i - 2] >> 10); + words[i] = words[i - 16] + s0 + words[i - 7] + s1; + } + + uint32_t A = state.hash_value[0]; + uint32_t B = state.hash_value[1]; + uint32_t C = state.hash_value[2]; + uint32_t D = state.hash_value[3]; + uint32_t E = state.hash_value[4]; + uint32_t F = state.hash_value[5]; + uint32_t G = state.hash_value[6]; + uint32_t H = state.hash_value[7]; + + for (int i = 0; i < 64; i++) { + uint32_t const s1 = + rotate_bits_right(E, 6) ^ rotate_bits_right(E, 11) ^ rotate_bits_right(E, 25); + uint32_t const ch = (E & F) ^ ((~E) & G); + uint32_t const temp1 = H + s1 + ch + sha256_hash_constants[i] + words[i]; + uint32_t const s0 = + rotate_bits_right(A, 2) ^ rotate_bits_right(A, 13) ^ rotate_bits_right(A, 22); + uint32_t const maj = (A & B) ^ (A & C) ^ (B & C); + uint32_t const temp2 = s0 + maj; + + H = G; + G = F; + F = E; + E = D + temp1; + D = C; + C = B; + B = A; + A = temp1 + temp2; + } + + state.hash_value[0] += A; + state.hash_value[1] += B; + state.hash_value[2] += C; + state.hash_value[3] += D; + state.hash_value[4] += E; + state.hash_value[5] += F; + state.hash_value[6] += G; + state.hash_value[7] += H; + + state.buffer_length = 0; +} + +/** + * @brief Core SHA-512 algorithm implementation + * + * Processes a single 1024-bit chunk, updating the hash value so far. + * This does not zero out the buffer contents. + */ +template +__device__ inline void sha512_hash_step(hash_state& state) +{ + uint64_t words[80]; + + // The 1024-bit message buffer fills the first 16 words. + memcpy(&words[0], state.buffer, sizeof(words[0]) * 16); + for (int i = 0; i < 16; i++) { + // Convert word representation from little-endian to big-endian. + words[i] = swap_endian(words[i]); + } + + // The rest of the 80 words are generated from the first 16 words. 
+ for (int i = 16; i < 80; i++) { + uint64_t const s0 = rotate_bits_right(words[i - 15], 1) ^ rotate_bits_right(words[i - 15], 8) ^ + (words[i - 15] >> 7); + uint64_t const s1 = rotate_bits_right(words[i - 2], 19) ^ rotate_bits_right(words[i - 2], 61) ^ + (words[i - 2] >> 6); + words[i] = words[i - 16] + s0 + words[i - 7] + s1; + } + + uint64_t A = state.hash_value[0]; + uint64_t B = state.hash_value[1]; + uint64_t C = state.hash_value[2]; + uint64_t D = state.hash_value[3]; + uint64_t E = state.hash_value[4]; + uint64_t F = state.hash_value[5]; + uint64_t G = state.hash_value[6]; + uint64_t H = state.hash_value[7]; + + for (int i = 0; i < 80; i++) { + uint64_t const s1 = + rotate_bits_right(E, 14) ^ rotate_bits_right(E, 18) ^ rotate_bits_right(E, 41); + uint64_t const ch = (E & F) ^ ((~E) & G); + uint64_t const temp1 = H + s1 + ch + sha512_hash_constants[i] + words[i]; + uint64_t const s0 = + rotate_bits_right(A, 28) ^ rotate_bits_right(A, 34) ^ rotate_bits_right(A, 39); + uint64_t const maj = (A & B) ^ (A & C) ^ (B & C); + uint64_t const temp2 = s0 + maj; + + H = G; + G = F; + F = E; + E = D + temp1; + D = C; + C = B; + B = A; + A = temp1 + temp2; + } + + state.hash_value[0] += A; + state.hash_value[1] += B; + state.hash_value[2] += C; + state.hash_value[3] += D; + state.hash_value[4] += E; + state.hash_value[5] += F; + state.hash_value[6] += G; + state.hash_value[7] += H; + + state.buffer_length = 0; +} + +// SHA supported leaf data type check +bool inline sha_leaf_type_check(data_type dt) +{ + return (is_fixed_width(dt) && !is_chrono(dt)) || (dt.id() == type_id::STRING); +} + +/** + * @brief Call a SHA-1 or SHA-2 hash function on a table view. + * + * @tparam Hasher The struct used for computing SHA hashes. + * + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to use for the device memory allocation + * @return A new column with the computed hash function result + */ +template +std::unique_ptr sha_hash(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.num_rows() == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } + + // Accepts string and fixed width columns. + // TODO: Accept single layer list columns holding those types. 
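One property the column assembly just below leans on is that every row of a given algorithm's output has the same length, so the offsets child of the result strings column is simply an arithmetic progression of digest_size built from a constant iterator. The digest_size constants themselves are hex-character counts, twice the raw digest size in bytes. A trivial standalone sketch, not part of the patch:

```
// digest_size per algorithm, expressed as 2x the raw digest length in bytes.
#include <cstdio>

int main()
{
  struct algo { char const* name; int raw_bytes; };
  algo const algos[5] = {
    {"SHA-1", 20}, {"SHA-224", 28}, {"SHA-256", 32}, {"SHA-384", 48}, {"SHA-512", 64}};
  for (auto const& a : algos) {
    // 40, 56, 64, 96 and 128 hex characters, matching the constants above.
    std::printf("%-7s digest_size = %3d\n", a.name, 2 * a.raw_bytes);
  }
  return 0;
}
```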
+ CUDF_EXPECTS( + std::all_of( + input.begin(), input.end(), [](auto const& col) { return sha_leaf_type_check(col.type()); }), + "Unsupported column type for hash function."); + + // Result column allocation and creation + auto begin = thrust::make_constant_iterator(Hasher::digest_size); + auto [offsets_column, bytes] = + cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); + + auto chars = rmm::device_uvector(bytes, stream, mr); + auto d_chars = chars.data(); + + auto const device_input = table_device_view::create(input, stream); + + // Hash each row, hashing each element sequentially left to right + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + [d_chars, device_input = *device_input] __device__(auto row_index) { + Hasher hasher(d_chars + (row_index * Hasher::digest_size)); + for (auto const& col : device_input) { + if (col.is_valid(row_index)) { + cudf::type_dispatcher( + col.type(), HasherDispatcher(&hasher, col), row_index); + } + } + hasher.finalize(); + }); + + return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); +} + +} // namespace detail +} // namespace hashing +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a3b982a6719..eee736613fe 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -169,6 +169,11 @@ ConfigureTest( hashing/md5_test.cpp hashing/murmurhash3_x86_32_test.cpp hashing/murmurhash3_x64_128_test.cpp + hashing/sha1_test.cpp + hashing/sha224_test.cpp + hashing/sha256_test.cpp + hashing/sha384_test.cpp + hashing/sha512_test.cpp hashing/spark_murmurhash3_x86_32_test.cpp hashing/xxhash_64_test.cpp ) diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 52ca52eb2ff..9361c4e748c 100644 --- a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -35,30 +34,43 @@ TEST_F(MD5HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. 
This string needed to be longer.", "All work and no play makes Jack a dull boy", - R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); - + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | md5sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.md5("input string".encode()).hexdigest()) + ``` + */ cudf::test::strings_column_wrapper const md5_string_results1( {"d41d8cd98f00b204e9800998ecf8427e", "682240021651ae166d08fe2a014d5c09", "3669d5225fddbb34676312ca3b78bbd9", "c61a4185135eda043f35e92c3505e180", - "52da74c75cb6575d25be29e66bd0adde"}); + "52da74c75cb6575d25be29e66bd0adde", + "65d1f8a3274d134f1ea9e6e854c72caa"}); cudf::test::strings_column_wrapper const md5_string_results2( {"d41d8cd98f00b204e9800998ecf8427e", "e5a5682e82278e78dbaad9a689df7a73", "4121ab1bb6e84172fd94822645862ae9", "28970886501efe20164213855afe5850", - "6bc1b872103cc6a02d882245b8516e2e"}); + "6bc1b872103cc6a02d882245b8516e2e", + "0772a7e13ec8fef61474c131598762f7"}); using limits = std::numeric_limits; cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max()}); + {0, -1, 100, -100, limits::min(), limits::max()}); - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1}); + // Test string inputs against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const md5_string_output1 = cudf::hashing::md5(string_input1); @@ -68,47 +80,23 @@ TEST_F(MD5HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_string_output1->view(), md5_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_string_output2->view(), md5_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const md5_output1 = cudf::hashing::md5(input1); auto const md5_output2 = cudf::hashing::md5(input2); EXPECT_EQ(input1.num_rows(), md5_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_output1->view(), md5_output2->view()); } -TEST_F(MD5HashTest, MultiValueNulls) +TEST_F(MD5HashTest, EmptyNullEquivalence) { - // Nulls with different values should be equal - cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " - "MD5 hash function. This string needed to be longer.", - "All work and no play makes Jack a dull boy", - R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "A 60 character string to test MD5's message padding algorithm", - "Very different... 
but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null - - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::md5(input1); auto const output2 = cudf::hashing::md5(input2); @@ -117,10 +105,12 @@ TEST_F(MD5HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TEST_F(MD5HashTest, StringListsNulls) +TEST_F(MD5HashTest, StringLists) { auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); + // Test of data serialization: a string should hash the same as a list of + // strings that concatenate to the same input. cudf::test::strings_column_wrapper const strings_col( {"", "A 60 character string to test MD5's message padding algorithm", @@ -131,7 +121,7 @@ TEST_F(MD5HashTest, StringListsNulls) cudf::test::lists_column_wrapper strings_list_col( {{""}, - {{"NULL", "A 60 character string to test MD5's message padding algorithm"}, validity}, + {{"", "A 60 character string to test MD5's message padding algorithm"}, validity}, {"A very long (greater than 128 bytes/char string) to test a multi hash-step data point in " "the " "MD5 hash function. 
This string needed to be longer.", @@ -153,7 +143,7 @@ class MD5HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(MD5HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(MD5HashTestTyped, Equality) +TYPED_TEST(MD5HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -166,31 +156,26 @@ TYPED_TEST(MD5HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(MD5HashTestTyped, EqualityNulls) +TYPED_TEST(MD5HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto const output1 = cudf::hashing::md5(input1); - auto const output2 = cudf::hashing::md5(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::md5(input); + auto const output2 = cudf::hashing::md5(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + EXPECT_EQ(input.num_rows(), output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } TEST_F(MD5HashTest, TestBoolListsWithNulls) { - cudf::test::fixed_width_column_wrapper const col1({0, 255, 255, 16, 27, 18, 100, 1, 2}, + cudf::test::fixed_width_column_wrapper const col1({0, 0, 0, 0, 1, 1, 1, 0, 0}, {1, 0, 0, 0, 1, 1, 1, 0, 0}); - cudf::test::fixed_width_column_wrapper const col2({0, 255, 255, 32, 81, 68, 3, 101, 4}, + cudf::test::fixed_width_column_wrapper const col2({0, 0, 0, 1, 0, 1, 0, 1, 0}, {1, 0, 0, 1, 0, 1, 0, 1, 0}); - cudf::test::fixed_width_column_wrapper const col3({0, 255, 255, 64, 49, 42, 5, 6, 102}, + cudf::test::fixed_width_column_wrapper const col3({0, 0, 0, 1, 1, 0, 0, 0, 1}, {1, 0, 0, 1, 1, 0, 0, 0, 1}); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; }); @@ -218,16 +203,16 @@ TYPED_TEST(MD5HashListTestTyped, TestListsWithNulls) { using T = TypeParam; - cudf::test::fixed_width_column_wrapper const col1({0, 255, 255, 16, 27, 18, 100, 1, 2}, + cudf::test::fixed_width_column_wrapper const col1({0, 0, 0, 0, 27, 18, 100, 0, 0}, {1, 0, 0, 0, 1, 1, 1, 0, 0}); - cudf::test::fixed_width_column_wrapper const col2({0, 255, 255, 32, 81, 68, 3, 101, 4}, + cudf::test::fixed_width_column_wrapper const col2({0, 0, 0, 32, 0, 68, 0, 101, 0}, {1, 0, 0, 1, 0, 1, 0, 1, 0}); - cudf::test::fixed_width_column_wrapper const col3({0, 255, 255, 64, 49, 42, 5, 6, 102}, + cudf::test::fixed_width_column_wrapper const col3({0, 0, 0, 64, 49, 0, 0, 0, 102}, {1, 0, 0, 1, 1, 0, 0, 0, 1}); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; }); cudf::test::lists_column_wrapper const list_col( - {{0, 0, 0}, {127}, {}, {{32, 127, 64}, validity}, {27, 49}, {18, 68}, {100}, {101}, {102}}, + {{0, 0, 0}, {}, {}, {{32, 0, 64}, validity}, {27, 49}, {18, 68}, {100}, {101}, {102}}, validity); auto const input1 = cudf::table_view({col1, col2, col3}); diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp new file mode 100644 index 00000000000..31145e4c3c4 --- /dev/null +++ b/cpp/tests/hashing/sha1_test.cpp @@ -0,0 +1,208 @@ +/* + * 
Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +class SHA1HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA1HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha1(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha1(empty_table); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA1HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. 
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha1sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha1("input string".encode()).hexdigest()) + ``` + */ + cudf::test::strings_column_wrapper const sha1_string_results1( + {"da39a3ee5e6b4b0d3255bfef95601890afd80709", + "b6589fc6ab0dc82cf12099d1c2d40ab994e8410c", + "cb73203438ab46ea54491c53e288a2703c440c4a", + "c595ebd13a785c1c2659e010a42e2ff9987ef51f", + "4ffaf61804c55b8c2171be548bef2e1d0baca17a", + "595965dd18f38087186162c788485fe249242131", + "a62ca720fbab830c8890044eacbeac216f1ca2e4", + "11e16c52273b5669a41d17ec7c187475193f88b3", + "0826be2f7b9340eed269c7f9f3f3662c0a3ece68"}); + + cudf::test::strings_column_wrapper const sha1_string_results2( + {"da39a3ee5e6b4b0d3255bfef95601890afd80709", + "fb96549631c835eb239cd614cc6b5cb7d295121a", + "e3977ee0ea7f238134ec93c79988fa84b7c5d79e", + "f6f75b6fa3c3d8d86b44fcb2c98c9ad4b37dcdd0", + "c7abd431a775c604edf41a62f7f215e7258dc16a", + "153fdf20d2bd8ae76241197314d6e0be7fe10f50", + "8c3656f7cb37898f9296c1965000d6da13fed64e", + "b4a848399375ec842c2cb445d98b5f80a4dce94f", + "106a56e997aa6a149cc5091750574a25c324fa65"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + + // Test string inputs against known outputs + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha1_string_output1 = cudf::hashing::sha1(string_input1); + auto const sha1_string_output2 = cudf::hashing::sha1(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha1_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha1_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_string_output1->view(), sha1_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_string_output2->view(), sha1_string_results2); + + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const sha1_output1 = cudf::hashing::sha1(input1); + auto const sha1_output2 = cudf::hashing::sha1(input2); + EXPECT_EQ(input1.num_rows(), sha1_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_output1->view(), sha1_output2->view()); +} + +TEST_F(SHA1HashTest, EmptyNullEquivalence) +{ + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); + + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); + + auto const output1 = cudf::hashing::sha1(input1); + auto const output2 = cudf::hashing::sha1(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TEST_F(SHA1HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input 
= cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); +} + +TEST_F(SHA1HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); +} + +template +class SHA1HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA1HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA1HashTestTyped, NoNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha1(input); + auto const output2 = cudf::hashing::sha1(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA1HashTestTyped, WithNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha1(input); + auto const output2 = cudf::hashing::sha1(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA1HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA1HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA1HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha1(input1); + auto const output2 = cudf::hashing::sha1(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp new file mode 100644 index 00000000000..9aa1ee0fac2 --- /dev/null +++ b/cpp/tests/hashing/sha224_test.cpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
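The TestExtremes case just above relies on the floating-point normalization done by HasherDispatcher in sha_hash.cuh earlier in this patch: every NaN is replaced by a canonical quiet NaN and negative zero by positive zero before the bytes are fed to the hash, so col1 and col2 produce identical digests. The sketch below is a host-side illustration only; normalize_for_hash is a hypothetical name, not a function from the patch.

```
// Host-side sketch of the pre-hash normalization of floating-point keys.
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

template <typename T>
T normalize_for_hash(T key)
{
  if (std::isnan(key)) { return std::numeric_limits<T>::quiet_NaN(); }
  if (key == T{0.0}) { return T{0.0}; }  // collapses -0.0 into +0.0
  return key;
}

int main()
{
  double const a = normalize_for_hash(-0.0);
  double const b = normalize_for_hash(0.0);
  uint64_t abits = 0, bbits = 0;
  std::memcpy(&abits, &a, sizeof(a));
  std::memcpy(&bbits, &b, sizeof(b));
  return abits == bbits ? 0 : 1;  // identical bytes hashed -> identical digests
}
```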
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +class SHA224HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA224HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha224(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha224(empty_table); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA224HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha224sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha224("input string".encode()).hexdigest()) + ``` + */ + cudf::test::strings_column_wrapper const sha224_string_results1( + {"d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f", + "dfd5f9139a820075df69d7895015360b76d0360f3d4b77a845689614", + "5d1ed8373987e403482cefe1662a63fa3076c0a5331d141f41654bbe", + "0662c91000b99de7a20c89097dd62f59120398d52499497489ccff95", + "f9ea303770699483f3e53263b32a3b3c876d1b8808ce84df4b8ca1c4", + "2da6cd4bdaa0a99fd7236cd5507c52e12328e71192e83b32d2f110f9", + "e7d0adb165079efc6c6343112f8b154aa3644ca6326f658aaa0f8e4a", + "309cc09eaa051beea7d0b0159daca9b4e8a533cb554e8f382c82709e", + "6c728722ae8eafd058672bd92958199ff3a5a129e8c076752f7650f8"}); + + cudf::test::strings_column_wrapper const sha224_string_results2( + {"d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f", + "5538ae2b02d4ae0b7090dc908ca69cd11a2ffad43c7435f1dbad5e6a", + "8e1955a473a149368dc0a931f99379b44b0bb752f206dbdf68629232", + "8581001e08295b7884428c022378cfdd643c977aefe4512f0252dc30", + "d5854dfe3c32996345b103a6a16c7bdfa924723d620b150737e77370", + "dd56deac5f2caa579a440ee814fc04a3afaf805d567087ac3317beb3", + "14fb559f6309604bedd89183f585f3b433932b5b0e675848feebf8ec", + "d219eefea538491efcb69bc5bbef4177ad991d1b6e1367b5981b8c31", + "5d5c2eace7ee553fe5cd25c8a8916e1eda81a5a5ca36a6338118a661"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + + // Test string inputs against known outputs + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha224_string_output1 = cudf::hashing::sha224(string_input1); + auto const sha224_string_output2 = cudf::hashing::sha224(string_input2); + 
EXPECT_EQ(string_input1.num_rows(), sha224_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha224_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_string_output1->view(), sha224_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_string_output2->view(), sha224_string_results2); + + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const sha224_output1 = cudf::hashing::sha224(input1); + auto const sha224_output2 = cudf::hashing::sha224(input2); + EXPECT_EQ(input1.num_rows(), sha224_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_output1->view(), sha224_output2->view()); +} + +TEST_F(SHA224HashTest, EmptyNullEquivalence) +{ + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); + + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); + + auto const output1 = cudf::hashing::sha224(input1); + auto const output2 = cudf::hashing::sha224(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TEST_F(SHA224HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); +} + +TEST_F(SHA224HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); +} + +template +class SHA224HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA224HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA224HashTestTyped, NoNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha224(input); + auto const output2 = cudf::hashing::sha224(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA224HashTestTyped, WithNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha224(input); + auto const output2 = cudf::hashing::sha224(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA224HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA224HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA224HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + 
cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha224(input1); + auto const output2 = cudf::hashing::sha224(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp new file mode 100644 index 00000000000..4fed8c55fc2 --- /dev/null +++ b/cpp/tests/hashing/sha256_test.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; + +class SHA256HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA256HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha256(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha256(empty_table); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA256HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. 
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha256sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha256("input string".encode()).hexdigest()) + ``` + */ + cudf::test::strings_column_wrapper const sha256_string_results1( + {"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9", + "d16883c666112142c1d72c9080b41161be7563250539e3f6ab6e2fdf2210074b", + "11174fa180460f5d683c2e63fcdd897dcbf10c28a9225d3ced9a8bbc3774415d", + "10a7d211e692c6f71bb9f7524ba1437588c2797356f05fc585340f002fe7015e", + "339d610dcb030bb4222bcf18c8ab82d911bfe7fb95b2cd9f6785fd4562b02401", + "2ce9936a4a2234bf8a76c37d92e01d549d03949792242e7f8a1ad68575e4e4a8", + "255fdd4d80a72f67921eb36f3e1157ea3e995068cee80e430c034e0d3692f614", + "9f9a89d448937f853c0067a3e2cb732d703eca971e3fb0f88fc73a730b7a85f4"}); + + cudf::test::strings_column_wrapper const sha256_string_results2( + {"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "f1534392279bddbf9d43dde8701cb5be14b82f76ec6607bf8d6ad557f60f304e", + "96c204fa5d44b2487abfec105a05f8ae634551604f6596202ca99e3724e3953a", + "2e7be264f3ecbb2930e7c54bf6c5fc1f310a8c63c50916bb713f34699ed11719", + "224e4dce71d5dbd5e79ba65aaced7ad9c4f45dda146278087b2b61d164f056f0", + "91f3108d4e9c696fdb37ae49fdc6a2237f1d1f977b7216406cc8a6365355f43b", + "490be480afe271685e9c1fdf46daac0b9bf7f25602e153ca92a0ddb0e4b662ef", + "4ddc45855d7ce3ab09efacff1fbafb33502f7dd468dc5a62826689c1c658dbce", + "bed32be19e1f432f5caec2b8bf914a968dfa5a5cba3868ea640ba9cbb0f9c9c8"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + + // Test string inputs against known outputs + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha256_string_output1 = cudf::hashing::sha256(string_input1); + auto const sha256_string_output2 = cudf::hashing::sha256(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha256_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha256_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_string_output1->view(), sha256_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_string_output2->view(), sha256_string_results2); + + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const sha256_output1 = cudf::hashing::sha256(input1); + auto const sha256_output2 = cudf::hashing::sha256(input2); + EXPECT_EQ(input1.num_rows(), sha256_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_output1->view(), sha256_output2->view()); +} +TEST_F(SHA256HashTest, EmptyNullEquivalence) +{ + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); + + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); + + auto const output1 = 
cudf::hashing::sha256(input1); + auto const output2 = cudf::hashing::sha256(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TEST_F(SHA256HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); +} + +TEST_F(SHA256HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); +} + +template +class SHA256HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA256HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA256HashTestTyped, NoNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha256(input); + auto const output2 = cudf::hashing::sha256(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA256HashTestTyped, WithNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha256(input); + auto const output2 = cudf::hashing::sha256(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA256HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA256HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA256HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha256(input1); + auto const output2 = cudf::hashing::sha256(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp new file mode 100644 index 00000000000..49b9b5ef3a5 --- /dev/null +++ b/cpp/tests/hashing/sha384_test.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +class SHA384HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA384HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha384(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha384(empty_table); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA384HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha384sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha384("input string".encode()).hexdigest()) + ``` + */ + cudf::test::strings_column_wrapper const sha384_string_results1( + {"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b" + "95b", + "5f91550edb03f0bb8917da57f0f8818976f5da971307b7ee4886bb951c4891a1f16f840dae8f655aa5df718884ebc" + "15b", + "982000cce895dc439edbcb7ba5b908cb5b7e939fe913d58506a486735a914b0dfbcebb02c33c428287baa0bfc7fe0" + "948", + "c3ea54e4d6d97c2a84dac9ac48ed9dd1a49118be880d8466044720cfdcd23427bf556f12204bb34ede29dbf207033" + "78c", + "5d7a853a18138fa90feac07c896dfca65a0f1eb2ed40f1fd7be6238dd7ef429bb1aeb0236735500eb954c9b4ba923" + "254", + "c72bcaf3a4b01986711cd5d2614aa8f9d7fad61455613eac4561b1468f9a25dd26566c8ad1190dec7567be4f6fc1d" + "b29", + "281826f23bebb3f835d2f15edcb0cdb3078ae2d7dc516f3a366af172dff4db6dd5833bc1e5ee411d52c598773e939" + "7b6", + "3a9d1a870a5f6a4c04df1daf1808163d33852897ebc757a5b028a1214fbc758485a392159b11bc360cfadc79f9512" + "822", + "f6d9687e48ef1f69f7523c2a06c338e2b2e6cb251823d46bfa7f9ba65a071693919726b85f6dd77726a73c57a0e3a" + "4a5"}); + + cudf::test::strings_column_wrapper const sha384_string_results2( + {"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b" + "95b", + "34ae2cd40efabf896d8d4173e500278d10671b2d914efb5480e8349190bc7e8e1d532ad568d00a8295ea536a9b42b" + "bc6", + "e80c25efd8032ea94dad1509a68f9bf745ce1184b8a148714c28c7e0fae1100ab14057417394f83118eaa151e014d" + "917", + 
"69eaddc4ef2ed967fc6a86d3ed3777b2c2015df4cf8bbbf65681556f451a4a0ae805a89c2d56641b4422b5f248c56" + "77d", + "112a6f9c74741d490747db90f5e901a88b7a32f637c030d6d96e5f89a70a5f1ee209e018648842c0e1d32002f95fd" + "d07", + "dc6f24bb0eb2c96fb53c52c402f073de089f3aeae9594be0c4f4cb31b13bd48769b80aa97d83a25ece1edf0c83373" + "f56", + "781a33adfdcdcbb514318728c074fbb59d44002995825642e0c9bfef8a2ccf3fb637b39ff3dd265df8cd93c86e945" + "ce9", + "d2efb1591c4503f23c34ddb4da6bb1017d3d4d7c9f23ee6aa52e71c98d41060bc35eb22f41b6130d5c42a6e717fb3" + "edf", + "46e493cdd8b1e43ce2e90b6934a39e724949a1f8ea6709e09dbc68172089de864873ee7e10decdff98b44fbce2ba8" + "146"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + + // Test string inputs against known outputs + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha384_string_output1 = cudf::hashing::sha384(string_input1); + auto const sha384_string_output2 = cudf::hashing::sha384(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha384_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha384_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output1->view(), sha384_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output2->view(), sha384_string_results2); + + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const sha384_output1 = cudf::hashing::sha384(input1); + auto const sha384_output2 = cudf::hashing::sha384(input2); + EXPECT_EQ(input1.num_rows(), sha384_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_output1->view(), sha384_output2->view()); +} + +TEST_F(SHA384HashTest, EmptyNullEquivalence) +{ + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); + + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); + + auto const output1 = cudf::hashing::sha384(input1); + auto const output2 = cudf::hashing::sha384(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TEST_F(SHA384HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); +} + +TEST_F(SHA384HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); +} + +template +class SHA384HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA384HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA384HashTestTyped, NoNulls) +{ + cudf::test::fixed_width_column_wrapper const 
col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha384(input); + auto const output2 = cudf::hashing::sha384(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA384HashTestTyped, WithNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha384(input); + auto const output2 = cudf::hashing::sha384(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA384HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA384HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA384HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha384(input1); + auto const output2 = cudf::hashing::sha384(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp new file mode 100644 index 00000000000..df0315099fb --- /dev/null +++ b/cpp/tests/hashing/sha512_test.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +class SHA512HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA512HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha512(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha512(empty_table); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA512HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha512sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha512("input string".encode()).hexdigest()) + ``` + */ + cudf::test::strings_column_wrapper const sha512_string_results1( + {"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877ee" + "c2f63b931bd47417a81a538327af927da3e", + "31bca02094eb78126a517b206a88c73cfa9ec6f704c7030d18212cace820f025f00bf0ea68dbf3f3a5436ca63b53b" + "f7bf80ad8d5de7d8359d0b7fed9dbc3ab99", + "1d8b355dbe0c4ad81c9815a1490f0b6a6fa710e42ca60767ffd6d845acd116defe307c9496a80c4a67653873af6ed" + "83e2e04c2102f55f9cd402677b246832e4c", + "8ac8ae9de5597aa630f071f81fcb94dc93b6a8f92d8f2cdd5a469764a5daf6ef387b6465ae097dcd6e0c64286260d" + "cc3d2c789d2cf5960df648c78a765e6c27c", + "9c436e24be60e17425a1a829642d97e7180b57485cf95db007cf5b32bbae1f2325b6874b3377e37806b15b739bffa" + "412ea6d095b726487d70e7b50e92d56c750", + "6a25ca1f20f6e79faea2a0770075e4262beb66b40f59c22d3e8abdb6188ef8d8914faf5dbf6df76165bb61b81dfda" + "46643f0d6366a39f7bd3d270312f9d3cf87", + "bae9eb4b5c05a4c5f85750b70b2f0ce78e387f992f0927a017eb40bd180a13004f6252a6bbf9816f195fb7d86668c" + "393dc0985aaf7168f48e8b905f3b9b02df2", + "05a4ca1c523dcab32edb7d8793934a4cdf41a9062b229d711f5326e297bda83fa965118b9d7636172b43688e8e149" + "008b3f967f1a969962b7e959af894a8a315", + "1a15d73f16820b25f2af1c824a00a6ab18fe3eb91adaae31f441f4eca7ca11baf56d2f56e4f600781bf3637a49a4f" + "bdbd5d7e0d8e894c51144e28eed59b3721a"}); + + cudf::test::strings_column_wrapper const sha512_string_results2( + {"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877ee" + "c2f63b931bd47417a81a538327af927da3e", + "8ab3361c051a97ddc3c665d29f2762f8ac4240d08995f8724b6d07d8cbedd32c28f589ccdae514f20a6c8eea6f755" + "408dd3dd6837d66932ca2352eaeab594427", + "338b22eb841420affff9904f903ed14c91bf8f4d1b10f25c145a31018367607a2cf562121ba7eaa2d08db3382cc82" + "149805198c1fa3e7dc714fc2782e0f6ebd8", + 
"d3045ecde16ea036d2f2ff3fa685beb46d5fcb73de71f0aee653265f18b22e4c131255e6eb5ad3be2f32914408ec6" + "67911b49d951714decbdbfca1957be8ba10", + "da7706221f8861ef522ab9555f57306382fb18c337536545d839e431dede4ff9f9affafb82ab5588734a8fc6631e6" + "a0cd864634b62e24a42755c863c5d5c5848", + "04dadc8fdf205fe535c8eb38f20882fc2a0e308081052d7588e74f6620aa207749039468c126db7407050def80415" + "1d037cb188d5d4d459015032972a9e9f001", + "aae2e742074847889a029a8d3170f9e17177d48ec0b9dabe572aa68dd3001af0c512f164ba84aa75b13950948170a" + "0912912d16c98d2f05cb633c0d5b6a9105e", + "77f46e99a7a51ac04b4380ebca70c0782381629f711169a3b9dad3fc9aa6221a9c0cdaa9b9ea4329773e773e2987c" + "d1eebe0661386909684927d67819a2cf736", + "023f99dea2a46cb4f0672645c4123697a57e2911c1889bcb5339383f81d78e0efbcca11568621b732e7ac13bef576" + "a79f0dfb0a1db2a2ede8a14e860e3a9f1bc"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + + // Test string inputs against known outputs + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha512_string_output1 = cudf::hashing::sha512(string_input1); + auto const sha512_string_output2 = cudf::hashing::sha512(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha512_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha512_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output1->view(), sha512_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output2->view(), sha512_string_results2); + + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const sha512_output1 = cudf::hashing::sha512(input1); + auto const sha512_output2 = cudf::hashing::sha512(input2); + EXPECT_EQ(input1.num_rows(), sha512_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_output1->view(), sha512_output2->view()); +} + +TEST_F(SHA512HashTest, EmptyNullEquivalence) +{ + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); + + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); + + auto const output1 = cudf::hashing::sha512(input1); + auto const output2 = cudf::hashing::sha512(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TEST_F(SHA512HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); +} + +TEST_F(SHA512HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); +} + +template +class SHA512HashTestTyped : 
public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA512HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA512HashTestTyped, NoNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha512(input); + auto const output2 = cudf::hashing::sha512(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA512HashTestTyped, WithNulls) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha512(input); + auto const output2 = cudf::hashing::sha512(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA512HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA512HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA512HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha512(input1); + auto const output2 = cudf::hashing::sha512(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index 4b9fe3f3cee..d55e244dc2c 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr @@ -20,6 +20,26 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input ) except + + cdef unique_ptr[column] sha1 "cudf::hashing::sha1" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha224 "cudf::hashing::sha224" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha256 "cudf::hashing::sha256" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha384 "cudf::hashing::sha384" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha512 "cudf::hashing::sha512" ( + const table_view& input + ) except + + cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" ( const table_view& input, const uint64_t seed diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index a4741239cf3..6854cff7763 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock @@ -10,7 +10,16 @@ from libcpp.vector cimport vector cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.hash cimport md5, murmurhash3_x86_32, xxhash_64 +from cudf._lib.cpp.hash cimport ( + md5, + murmurhash3_x86_32, + sha1, + sha224, + sha256, + sha384, + sha512, + xxhash_64, +) from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -50,6 +59,21 @@ def hash(list source_columns, str method, int seed=0): elif method == "md5": with nogil: c_result = move(md5(c_source_view)) + elif method == "sha1": + with nogil: + c_result = move(sha1(c_source_view)) + elif method == "sha224": + with nogil: + c_result = move(sha224(c_source_view)) + elif method == "sha256": + with nogil: + c_result = move(sha256(c_source_view)) + elif method == "sha384": + with nogil: + c_result = move(sha384(c_source_view)) + elif method == "sha512": + with nogil: + c_result = move(sha512(c_source_view)) elif method == "xxhash64": with nogil: c_result = move(xxhash_64(c_source_view, seed)) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d75db7dfaae..37c115a47d9 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1390,7 +1390,19 @@ def test_assign_callable(mapping): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) -@pytest.mark.parametrize("method", ["murmur3", "md5", "xxhash64"]) +@pytest.mark.parametrize( + "method", + [ + "murmur3", + "md5", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "xxhash64", + ], +) @pytest.mark.parametrize("seed", [None, 42]) def test_dataframe_hash_values(nrows, method, seed): warning_expected = seed is not None and method not in { @@ -1415,6 +1427,11 @@ def test_dataframe_hash_values(nrows, method, seed): expected_dtypes = { "murmur3": np.uint32, "md5": object, + "sha1": object, + "sha224": object, + "sha256": object, + "sha384": object, + "sha512": object, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 36033a72479..2e2b79386d7 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1402,7 +1402,9 @@ def test_series_sort_index( assert_eq(expected, got, check_index_type=True) -@pytest.mark.parametrize("method", ["md5"]) +@pytest.mark.parametrize( + "method", ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] +) def test_series_hash_values(method): inputs = cudf.Series( [ From daee3a38b3ec07027d36be33facd5f36812e226e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 22 Jan 2024 16:24:36 -0600 Subject: [PATCH 34/60] Remove -DNVBench_ENABLE_CUPTI=OFF. (#14820) The `-DNVBench_ENABLE_CUPTI=OFF` flag is no longer needed because of https://github.com/rapidsai/rapids-cmake/pull/504. NVBench CUPTI support is now disabled by default. 
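(For reference, the SHA methods plumbed through `hash.pyx` and exercised in `test_dataframe.py` above are reachable from the public `hash_values` API. A minimal usage sketch follows; the frame contents are illustrative, and per the `expected_dtypes` map added in the tests, the SHA digests come back as hex strings with `object` dtype.)

```python
import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# One hex digest string per row; dtype is `object` for the SHA methods.
digests = df.hash_values(method="sha256")
```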
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14820 --- conda/recipes/libcudf/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh index 47047f41b25..fef3dabd733 100644 --- a/conda/recipes/libcudf/build.sh +++ b/conda/recipes/libcudf/build.sh @@ -1,9 +1,9 @@ #!/bin/bash -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. export cudf_ROOT="$(realpath ./cpp/build)" ./build.sh -n -v \ libcudf libcudf_kafka benchmarks tests \ --build_metrics --incl_cache_stats \ - --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON -DNVBench_ENABLE_CUPTI=OFF\" + --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\" From ef3ce4bc8db008f58249241c16c80f7e6e600fa9 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 22 Jan 2024 16:36:10 -0800 Subject: [PATCH 35/60] Fix total_byte_size in Parquet row group metadata (#14802) The `total_byte_size` field in the row group metadata should be "[t]otal byte size of all the uncompressed column data in this row group". cuDF currently populates this field with the _compressed_ size. This PR fixes that and adds a test. Authors: - Ed Seidl (https://github.com/etseidl) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/14802 --- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/tests/io/parquet_writer_test.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 417577f7b89..93b225dca1b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2074,7 +2074,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, need_sync = true; } - row_group.total_byte_size += ck.compressed_size; + row_group.total_byte_size += ck.bfr_size; column_chunk_meta.total_uncompressed_size = ck.bfr_size; column_chunk_meta.total_compressed_size = ck.compressed_size; } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 946c0e23f08..2df34c7928b 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1401,6 +1401,33 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics) EXPECT_EQ(max_value, std::string(max_val)); } +TEST_F(ParquetWriterTest, RowGroupMetadata) +{ + using column_type = int; + constexpr int num_rows = 1'000; + auto const ones = thrust::make_constant_iterator(1); + auto const col = + cudf::test::fixed_width_column_wrapper{ones, ones + num_rows, no_nulls()}; + auto const table = table_view({col}); + + auto const filepath = temp_env->get_temp_filepath("RowGroupMetadata.parquet"); + // force PLAIN encoding to make size calculation easier + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .compression(cudf::io::compression_type::ZSTD); + cudf::io::write_parquet(opts); + + // check row group metadata to make sure total_byte_size is the uncompressed value + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + 
read_footer(source, &fmd); + + ASSERT_GT(fmd.row_groups.size(), 0); + EXPECT_GE(fmd.row_groups[0].total_byte_size, + static_cast(num_rows * sizeof(column_type))); +} + // See #14772. // zStandard compression cannot currently be used with V2 page headers due to buffer // alignment issues. @@ -1416,6 +1443,7 @@ TEST_F(ParquetWriterTest, ZstdWithV2Header) EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); } +///////////////////////////////////////////////////////////// // custom mem mapped data sink that supports device writes template class custom_test_memmap_sink : public cudf::io::data_sink { From c9ec9c247f90010b27f8f7aa057406fd46581b8e Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Mon, 22 Jan 2024 21:43:03 -0500 Subject: [PATCH 36/60] Fix 24.04 versions (#14825) Fix `update-version.sh` to include pip `cu` suffixed dependencies Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14825 --- ci/release/update-version.sh | 4 ++-- dependencies.yaml | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index f9b1436495a..47e3f887d7d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. ######################## # cuDF Version Updater # ######################## @@ -86,7 +86,7 @@ DEPENDENCIES=( ) for DEP in "${DEPENDENCIES[@]}"; do for FILE in dependencies.yaml conda/environments/*.yaml; do - sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" ${FILE} + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}" done for FILE in python/*/pyproject.toml; do sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} diff --git a/dependencies.yaml b/dependencies.yaml index cd9591b89e2..e62fa86d4d4 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -301,10 +301,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &build_python_packages_cu12 - - &rmm_cu12 rmm-cu12==24.2.* + - &rmm_cu12 rmm-cu12==24.4.* - matrix: {cuda: "11.*"} packages: &build_python_packages_cu11 - - &rmm_cu11 rmm-cu11==24.2.* + - &rmm_cu11 rmm-cu11==24.4.* - {matrix: null, packages: null } - output_types: pyproject matrices: @@ -545,11 +545,11 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.2.* + - rmm-cu12==24.4.* - pynvjitlink-cu12 - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.2.* + - rmm-cu11==24.4.* - cubinlinker-cu11 - ptxcompiler-cu11 - {matrix: null, packages: null} @@ -677,10 +677,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf-cu12==24.2.* + - cudf-cu12==24.4.* - matrix: {cuda: "11.*"} packages: - - cudf-cu11==24.2.* + - cudf-cu11==24.4.* - {matrix: null, packages: [*cudf_conda]} depends_on_cudf_kafka: common: @@ -698,10 +698,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf_kafka-cu12==24.2.* + - cudf_kafka-cu12==24.4.* - matrix: {cuda: "11.*"} packages: - - cudf_kafka-cu11==24.2.* + - cudf_kafka-cu11==24.4.* - {matrix: null, packages: [*cudf_kafka_conda]} depends_on_cupy: common: From a39897c108d44a4d5e027ca741be5462863eeefc Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Tue, 23 Jan 2024 
00:14:31 -0800
Subject: [PATCH 37/60] Fix Aggregation Type Promotion: Ensure Unsigned Input
 Types Result in Unsigned Output for Sum and Multiply (#14679)

During aggregation, output types are modified to prevent overflow. Presently,
summing INT32 yields INT64, but summing UINT32 still results in INT64 instead
of UINT64. This pull request resolves Issue
#[10149](https://github.com/rapidsai/cudf/issues/10149) to ensure the correct
output type is used when summing or multiplying integers.

Authors:
  - Suraj Aralihalli (https://github.com/SurajAralihalli)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/14679
---
 cpp/include/cudf/detail/aggregation/aggregation.hpp |  6 +++---
 cpp/tests/groupby/sum_tests.cpp                     | 13 ++++++++-----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 784f05a964e..c35d56b4c13 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1234,12 +1234,12 @@ constexpr bool is_sum_product_agg(aggregation::Kind k)
          (k == aggregation::SUM_OF_SQUARES);
 }
 
-// Summing/Multiplying integers of any type, always use int64_t accumulator
+// Summing/Multiplying integers of any type, always use uint64_t for unsigned and int64_t for signed
 template <typename Source, aggregation::Kind k>
 struct target_type_impl<Source,
                         k,
                         std::enable_if_t<std::is_integral_v<Source> && is_sum_product_agg(k)>> {
-  using type = int64_t;
+  using type = std::conditional_t<std::is_unsigned_v<Source>, uint64_t, int64_t>;
 };
 
 // Summing fixed_point numbers
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 35e8fd18a4d..abf25eb0aa9 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,10 +28,10 @@ using namespace cudf::test::iterators; template struct groupby_sum_test : public cudf::test::BaseFixture {}; -using K = int32_t; -using supported_types = - cudf::test::Concat, - cudf::test::DurationTypes>; +using K = int32_t; +using supported_types = cudf::test::Concat< + cudf::test::Types, + cudf::test::DurationTypes>; TYPED_TEST_SUITE(groupby_sum_test, supported_types); @@ -40,6 +40,9 @@ TYPED_TEST(groupby_sum_test, basic) using V = TypeParam; using R = cudf::detail::target_type_t; + static_assert(std::is_signed_v == std::is_signed_v, + "Both Result type and Source type must have same signedness"); + cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; cudf::test::fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; From bc706af8fd688a6c9d92c7c03aecb8795a9df9b1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 23 Jan 2024 10:19:13 -0500 Subject: [PATCH 38/60] Use get_offset_value utility in strings shift function (#14743) Updates `cudf::strings::detail::shift` to support int32 or int64 offset types by changing calls to `get_value` to `get_offset_value`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14743 --- cpp/src/strings/copying/shift.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 331cdecc36f..3a83cdab045 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -59,9 +59,9 @@ struct output_sizes_fn { struct shift_chars_fn { column_device_view const d_column; // input strings column string_view const d_filler; - size_type const offset; + int64_t const offset; - __device__ char operator()(size_type idx) + __device__ char operator()(int64_t idx) { if (offset < 0) { auto const last_index = -offset; @@ -110,7 +110,7 @@ std::unique_ptr shift(strings_column_view const& input, // compute the shift-offset for the output characters child column auto const shift_offset = [&] { auto const index = (offset < 0) ? input.size() + offset : offset; - return (offset < 0 ? -1 : 1) * cudf::detail::get_value(offsets_view, index, stream); + return (offset < 0 ? -1 : 1) * get_offset_value(offsets_view, index, stream); }(); // create output chars child column @@ -119,8 +119,8 @@ std::unique_ptr shift(strings_column_view const& input, // run kernel to shift all the characters thrust::transform(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(total_bytes), + thrust::counting_iterator(0), + thrust::counting_iterator(total_bytes), d_chars, shift_chars_fn{*d_input, d_fill_str, shift_offset}); From 48367a9d0cec39915e6c4e3ec33336480359260f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Jan 2024 08:18:26 -0800 Subject: [PATCH 39/60] Migrate binary operations to pylibcudf (#14821) This PR migrates the binary operations in cuDF Python to pylibcudf. 
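As a rough illustration of the new dispatch path (the column contents, the `INT64` output type, and the way the output `DataType` is built here are illustrative assumptions; the entry points are the ones introduced in `binaryop.pyx` and `pylibcudf/binaryop.pyx` in the diff below):

```python
import cudf
from cudf._lib import pylibcudf

lhs = cudf.Series([1, 2, 3])._column
rhs = cudf.Series([10, 20, 30])._column

# cudf._lib.binaryop.binaryop now forwards to pylibcudf roughly like this:
result = pylibcudf.binaryop.binary_operation(
    lhs.to_pylibcudf(mode="read"),
    rhs.to_pylibcudf(mode="read"),
    pylibcudf.binaryop.BinaryOperator.ADD,
    pylibcudf.DataType(pylibcudf.TypeId.INT64),
)
# The Cython layer then wraps `result` back into a cudf column via
# Column.from_pylibcudf(result), as shown in the new binaryop.pyx.
```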
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14821 --- .../api_docs/pylibcudf/binaryop.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/binaryop.pxd | 5 - python/cudf/cudf/_lib/binaryop.pyx | 261 +++--------------- python/cudf/cudf/_lib/cpp/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/cpp/binaryop.pxd | 75 ++--- python/cudf/cudf/_lib/cpp/binaryop.pyx | 0 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 6 +- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 5 +- python/cudf/cudf/_lib/pylibcudf/__init__.py | 5 +- python/cudf/cudf/_lib/pylibcudf/binaryop.pxd | 14 + python/cudf/cudf/_lib/pylibcudf/binaryop.pyx | 86 ++++++ python/cudf/cudf/tests/test_udf_binops.py | 51 ---- 13 files changed, 185 insertions(+), 334 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst delete mode 100644 python/cudf/cudf/_lib/binaryop.pxd create mode 100644 python/cudf/cudf/_lib/cpp/binaryop.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/binaryop.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/binaryop.pyx delete mode 100644 python/cudf/cudf/tests/test_udf_binops.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst new file mode 100644 index 00000000000..e5bc6aa7cda --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst @@ -0,0 +1,6 @@ +======== +binaryop +======== + +.. automodule:: cudf._lib.pylibcudf.binaryop + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 435278afeeb..7504295de92 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -8,6 +8,7 @@ This page provides API documentation for pylibcudf. :maxdepth: 1 :caption: API Documentation + binaryop column copying gpumemoryview diff --git a/python/cudf/cudf/_lib/binaryop.pxd b/python/cudf/cudf/_lib/binaryop.pxd deleted file mode 100644 index 1f6022251b3..00000000000 --- a/python/cudf/cudf/_lib/binaryop.pxd +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -ctypedef int32_t underlying_type_t_binary_operator diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index 6212347b5b1..969be426044 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -1,160 +1,30 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from enum import IntEnum - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf._lib.binaryop cimport underlying_type_t_binary_operator from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.types cimport dtype_to_pylibcudf_type -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.scalar.scalar cimport scalar -from cudf._lib.cpp.types cimport data_type, type_id -from cudf._lib.types cimport dtype_to_data_type, underlying_type_t_type_id - -from cudf.api.types import is_scalar +from cudf._lib import pylibcudf +from cudf._lib.scalar import as_device_scalar from cudf.core.buffer import acquire_spill_lock -cimport cudf._lib.cpp.binaryop as cpp_binaryop -from cudf._lib.cpp.binaryop cimport binary_operator -import cudf - - -class BinaryOperation(IntEnum): - ADD = ( - binary_operator.ADD - ) - SUB = ( - binary_operator.SUB - ) - MUL = ( - binary_operator.MUL - ) - DIV = ( - binary_operator.DIV - ) - TRUEDIV = ( - binary_operator.TRUE_DIV - ) - FLOORDIV = ( - binary_operator.FLOOR_DIV - ) - MOD = ( - binary_operator.PYMOD - ) - POW = ( - binary_operator.POW - ) - INT_POW = ( - binary_operator.INT_POW - ) - EQ = ( - binary_operator.EQUAL - ) - NE = ( - binary_operator.NOT_EQUAL - ) - LT = ( - binary_operator.LESS - ) - GT = ( - binary_operator.GREATER - ) - LE = ( - binary_operator.LESS_EQUAL - ) - GE = ( - binary_operator.GREATER_EQUAL - ) - AND = ( - binary_operator.BITWISE_AND - ) - OR = ( - binary_operator.BITWISE_OR - ) - XOR = ( - binary_operator.BITWISE_XOR - ) - L_AND = ( - binary_operator.LOGICAL_AND - ) - L_OR = ( - binary_operator.LOGICAL_OR - ) - GENERIC_BINARY = ( - binary_operator.GENERIC_BINARY - ) - NULL_EQUALS = ( - binary_operator.NULL_EQUALS - ) - - -cdef binaryop_v_v(Column lhs, Column rhs, - binary_operator c_op, data_type c_dtype): - cdef column_view c_lhs = lhs.view() - cdef column_view c_rhs = rhs.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_binaryop.binary_operation( - c_lhs, - c_rhs, - c_op, - c_dtype - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -cdef binaryop_v_s(Column lhs, DeviceScalar rhs, - binary_operator c_op, data_type c_dtype): - cdef column_view c_lhs = lhs.view() - cdef const scalar* c_rhs = rhs.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_binaryop.binary_operation( - c_lhs, - c_rhs[0], - c_op, - c_dtype - ) - ) - - return Column.from_unique_ptr(move(c_result)) - -cdef binaryop_s_v(DeviceScalar lhs, Column rhs, - binary_operator c_op, data_type c_dtype): - cdef const scalar* c_lhs = lhs.get_raw_ptr() - cdef column_view c_rhs = rhs.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_binaryop.binary_operation( - c_lhs[0], - c_rhs, - c_op, - c_dtype - ) - ) - - return Column.from_unique_ptr(move(c_result)) +# Map pandas operation names to pylibcudf operation names. 
+_op_map = { + "TRUEDIV": "TRUE_DIV", + "FLOORDIV": "FLOOR_DIV", + "MOD": "PYMOD", + "EQ": "EQUAL", + "NE": "NOT_EQUAL", + "LT": "LESS", + "GT": "GREATER", + "LE": "LESS_EQUAL", + "GE": "GREATER_EQUAL", + "AND": "BITWISE_AND", + "OR": "BITWISE_OR", + "XOR": "BITWISE_XOR", + "L_AND": "LOGICAL_AND", + "L_OR": "LOGICAL_OR", +} @acquire_spill_lock() @@ -166,74 +36,25 @@ def binaryop(lhs, rhs, op, dtype): # pipeline for libcudf binops that don't map to Python binops. if op not in {"INT_POW", "NULL_EQUALS"}: op = op[2:-2] - - op = BinaryOperation[op.upper()] - cdef binary_operator c_op = ( - op - ) - - cdef data_type c_dtype = dtype_to_data_type(dtype) - - if is_scalar(lhs) or lhs is None: - s_lhs = as_device_scalar(lhs, dtype=rhs.dtype if lhs is None else None) - result = binaryop_s_v( - s_lhs, - rhs, - c_op, - c_dtype - ) - - elif is_scalar(rhs) or rhs is None: - s_rhs = as_device_scalar(rhs, dtype=lhs.dtype if rhs is None else None) - result = binaryop_v_s( - lhs, - s_rhs, - c_op, - c_dtype - ) - - else: - result = binaryop_v_v( - lhs, - rhs, - c_op, - c_dtype - ) - return result - - -@acquire_spill_lock() -def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype): - """ - Apply a user-defined binary operator (a UDF) defined in `udf_ptx` on - the two input columns `lhs` and `rhs`. The output type of the UDF - has to be specified in `dtype`, a numpy data type. - Currently ONLY int32, int64, float32 and float64 are supported. - """ - cdef column_view c_lhs = lhs.view() - cdef column_view c_rhs = rhs.view() - - cdef type_id tid = ( - ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype(dtype)] - ) + op = op.upper() + op = _op_map.get(op, op) + + return Column.from_pylibcudf( + # Check if the dtype args are desirable here. + pylibcudf.binaryop.binary_operation( + lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column) + else ( + as_device_scalar( + lhs, dtype=rhs.dtype if lhs is None else None + ) + ).c_value, + rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column) + else ( + as_device_scalar( + rhs, dtype=lhs.dtype if rhs is None else None + ) + ).c_value, + pylibcudf.binaryop.BinaryOperator[op], + dtype_to_pylibcudf_type(dtype), ) ) - cdef data_type c_dtype = data_type(tid) - - cdef string cpp_str = udf_ptx.encode("UTF-8") - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_binaryop.binary_operation( - c_lhs, - c_rhs, - cpp_str, - c_dtype - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/cpp/CMakeLists.txt b/python/cudf/cudf/_lib/cpp/CMakeLists.txt index a99aa58dfe8..764f28add0e 100644 --- a/python/cudf/cudf/_lib/cpp/CMakeLists.txt +++ b/python/cudf/cudf/_lib/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources copying.pyx types.pyx) +set(cython_sources binaryop.pyx copying.pyx types.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/cpp/binaryop.pxd index f73a9502cd1..735216e656a 100644 --- a/python/cudf/cudf/_lib/cpp/binaryop.pxd +++ b/python/cudf/cudf/_lib/cpp/binaryop.pxd @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -10,30 +11,30 @@ from cudf._lib.cpp.types cimport data_type cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: - ctypedef enum binary_operator: - ADD "cudf::binary_operator::ADD" - SUB "cudf::binary_operator::SUB" - MUL "cudf::binary_operator::MUL" - DIV "cudf::binary_operator::DIV" - TRUE_DIV "cudf::binary_operator::TRUE_DIV" - FLOOR_DIV "cudf::binary_operator::FLOOR_DIV" - MOD "cudf::binary_operator::MOD" - PYMOD "cudf::binary_operator::PYMOD" - POW "cudf::binary_operator::POW" - INT_POW "cudf::binary_operator::INT_POW" - EQUAL "cudf::binary_operator::EQUAL" - NOT_EQUAL "cudf::binary_operator::NOT_EQUAL" - LESS "cudf::binary_operator::LESS" - GREATER "cudf::binary_operator::GREATER" - LESS_EQUAL "cudf::binary_operator::LESS_EQUAL" - GREATER_EQUAL "cudf::binary_operator::GREATER_EQUAL" - NULL_EQUALS "cudf::binary_operator::NULL_EQUALS" - BITWISE_AND "cudf::binary_operator::BITWISE_AND" - BITWISE_OR "cudf::binary_operator::BITWISE_OR" - BITWISE_XOR "cudf::binary_operator::BITWISE_XOR" - LOGICAL_AND "cudf::binary_operator::LOGICAL_AND" - LOGICAL_OR "cudf::binary_operator::LOGICAL_OR" - GENERIC_BINARY "cudf::binary_operator::GENERIC_BINARY" + cpdef enum class binary_operator(int32_t): + ADD + SUB + MUL + DIV + TRUE_DIV + FLOOR_DIV + MOD + PYMOD + POW + INT_POW + EQUAL + NOT_EQUAL + LESS + GREATER + LESS_EQUAL + GREATER_EQUAL + NULL_EQUALS + BITWISE_AND + BITWISE_OR + BITWISE_XOR + LOGICAL_AND + LOGICAL_OR + GENERIC_BINARY cdef unique_ptr[column] binary_operation ( const scalar& lhs, @@ -62,27 +63,3 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const string& op, data_type output_type ) except + - - unique_ptr[column] jit_binary_operation \ - "cudf::jit::binary_operation" ( - const column_view& lhs, - const column_view& rhs, - binary_operator op, - data_type output_type - ) except + - - unique_ptr[column] jit_binary_operation \ - "cudf::jit::binary_operation" ( - const column_view& lhs, - const scalar& rhs, - binary_operator op, - data_type output_type - ) except + - - unique_ptr[column] jit_binary_operation \ - "cudf::jit::binary_operation" ( - const scalar& lhs, - const column_view& rhs, - binary_operator op, - data_type output_type - ) except + diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pyx b/python/cudf/cudf/_lib/cpp/binaryop.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 870a00f99a9..acb013c8b8c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx - types.pyx utils.pyx +set(cython_sources binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx + table.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 7a35854392c..f4b8c50eecc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,7 +1,7 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport copying, interop +from . cimport binaryop, copying, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -15,6 +15,7 @@ __all__ = [ "DataType", "Scalar", "Table", + "binaryop", "copying", "gpumemoryview", "interop", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 72b74a57b87..a27d80fc5a2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. -from . import copying, interop +from . import binaryop, copying, interop from .column import Column from .gpumemoryview import gpumemoryview from .scalar import Scalar @@ -13,6 +13,7 @@ "Scalar", "Table", "TypeId", + "binaryop", "copying", "gpumemoryview", "interop", diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd new file mode 100644 index 00000000000..56b98333757 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.cpp.binaryop cimport binary_operator + +from .column cimport Column +from .types cimport DataType + + +cpdef Column binary_operation( + object lhs, + object rhs, + binary_operator op, + DataType data_type +) diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx new file mode 100644 index 00000000000..af248ba2071 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx @@ -0,0 +1,86 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator import dereference + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp cimport binaryop as cpp_binaryop +from cudf._lib.cpp.binaryop cimport binary_operator +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.binaryop import \ + binary_operator as BinaryOperator # no-cython-lint + +from .column cimport Column +from .scalar cimport Scalar +from .types cimport DataType + + +cpdef Column binary_operation( + object lhs, + object rhs, + binary_operator op, + DataType data_type +): + """Perform a binary operation between a column and another column or scalar. + + Either ``lhs`` or ``rhs`` must be a + :py:class:`~cudf._lib.pylibcudf.column.Column`. 
The other may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + + For details, see :cpp:func:`binary_operation`. + + Parameters + ---------- + lhs : Column or Scalar + The left hand side argument. + rhs : Column or Scalar + The right hand side argument. + op : BinaryOperator + The operation to perform. + data_type : DataType + The output to use for the output. + + Returns + ------- + pylibcudf.Column + The result of the binary operation + """ + cdef unique_ptr[column] result + + if isinstance(lhs, Column) and isinstance(rhs, Column): + with nogil: + result = move( + cpp_binaryop.binary_operation( + ( lhs).view(), + ( rhs).view(), + op, + data_type.c_obj + ) + ) + elif isinstance(lhs, Column) and isinstance(rhs, Scalar): + with nogil: + result = move( + cpp_binaryop.binary_operation( + ( lhs).view(), + dereference(( rhs).c_obj), + op, + data_type.c_obj + ) + ) + elif isinstance(lhs, Scalar) and isinstance(rhs, Column): + with nogil: + result = move( + cpp_binaryop.binary_operation( + dereference(( lhs).c_obj), + ( rhs).view(), + op, + data_type.c_obj + ) + ) + else: + raise ValueError(f"Invalid arguments {lhs} and {rhs}") + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py deleted file mode 100644 index 1ad45e721a3..00000000000 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -import numpy as np -import pytest -from numba.cuda import compile_ptx -from numba.np import numpy_support - -import rmm - -import cudf -from cudf import Series, _lib as libcudf -from cudf.utils import dtypes as dtypeutils - -_driver_version = rmm._cuda.gpu.driverGetVersion() -_runtime_version = rmm._cuda.gpu.runtimeGetVersion() -_CUDA_JIT128INT_SUPPORTED = (_driver_version >= 11050) and ( - _runtime_version >= 11050 -) - - -@pytest.mark.skipif(not _CUDA_JIT128INT_SUPPORTED, reason="requires CUDA 11.5") -@pytest.mark.parametrize( - "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int8"})) -) -def test_generic_ptx(dtype): - - size = 500 - - lhs_arr = np.random.random(size).astype(dtype) - lhs_col = Series(lhs_arr)._column - - rhs_arr = np.random.random(size).astype(dtype) - rhs_col = Series(rhs_arr)._column - - def generic_function(a, b): - return a**3 + b - - nb_type = numpy_support.from_dtype(cudf.dtype(dtype)) - type_signature = (nb_type, nb_type) - - ptx_code, output_type = compile_ptx( - generic_function, type_signature, device=True - ) - - dtype = numpy_support.as_dtype(output_type).type - - out_col = libcudf.binaryop.binaryop_udf(lhs_col, rhs_col, ptx_code, dtype) - - result = lhs_arr**3 + rhs_arr - - np.testing.assert_almost_equal(result, out_col.values_host) From c83b9fdcf45aa0b7204ef0313dc0a778dc15e017 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 23 Jan 2024 06:19:34 -1000 Subject: [PATCH 40/60] Refactor and add validation to IntervalIndex.__init__ (#14778) * Adding validation to `closed`, `dtype` arguments in `ItervalIndex.__init__` * Ensure `closed` attribute always maps to `IntervalDtype.closed` * `build_interval_column` was no longer necessary by using `IntervalColumn` directly Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14778 --- 
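A minimal sketch of the constructor validation this patch introduces, mirroring the `test_intervalindex_invalid_dtype` and `test_intervalindex_conflicting_closed` tests added below; the exception types and messages come from the diff, the surrounding driver code is illustrative only:

```python
import pandas as pd

import cudf

# dtype must now be an IntervalDtype; a plain "int64" raises TypeError.
try:
    cudf.IntervalIndex([pd.Interval(1, 2)], dtype="int64")
except TypeError as err:
    print(err)  # dtype must be an IntervalDtype

# closed must agree with dtype.closed; a mismatch raises ValueError.
try:
    cudf.IntervalIndex(
        [pd.Interval(1, 2)],
        dtype=cudf.IntervalDtype("int64", closed="left"),
        closed="right",
    )
except ValueError as err:
    print(err)  # closed keyword does not match dtype.closed
```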
python/cudf/cudf/core/column/column.py | 50 +------- python/cudf/cudf/core/column/interval.py | 25 +--- python/cudf/cudf/core/index.py | 120 ++++++++++++------ .../cudf/cudf/tests/indexes/test_interval.py | 29 ++++- python/cudf/cudf/tests/test_udf_masked_ops.py | 4 +- 5 files changed, 114 insertions(+), 114 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7a99ef9f470..dc060a7117e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -999,14 +999,14 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: "`.astype('str')` instead." ) return col.as_string_column(dtype) + elif isinstance(dtype, IntervalDtype): + return col.as_interval_column(dtype) elif isinstance(dtype, (ListDtype, StructDtype)): if not col.dtype == dtype: raise NotImplementedError( f"Casting {self.dtype} columns not currently supported" ) return col - elif isinstance(dtype, IntervalDtype): - return col.as_interval_column(dtype) elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return col.as_decimal_column(dtype) elif np.issubdtype(cast(Any, dtype), np.datetime64): @@ -1689,52 +1689,6 @@ def build_categorical_column( return cast("cudf.core.column.CategoricalColumn", result) -def build_interval_column( - left_col, - right_col, - mask=None, - size=None, - offset=0, - null_count=None, - closed="right", -): - """ - Build an IntervalColumn - - Parameters - ---------- - left_col : Column - Column of values representing the left of the interval - right_col : Column - Column of representing the right of the interval - mask : Buffer - Null mask - size : int, optional - offset : int, optional - closed : {"left", "right", "both", "neither"}, default "right" - Whether the intervals are closed on the left-side, right-side, - both or neither. 
- """ - left = as_column(left_col) - right = as_column(right_col) - if closed not in {"left", "right", "both", "neither"}: - closed = "right" - if type(left_col) is not list: - dtype = IntervalDtype(left_col.dtype, closed) - else: - dtype = IntervalDtype("int64", closed) - size = len(left) - return build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=(left, right), - ) - - def build_list_column( indices: ColumnBase, elements: ColumnBase, diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 6a7e7729123..7227ef8ba3a 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -18,7 +18,6 @@ def __init__( offset=0, null_count=None, children=(), - closed="right", ): super().__init__( data=None, @@ -29,14 +28,6 @@ def __init__( null_count=null_count, children=children, ) - if closed in ["left", "right", "neither", "both"]: - self._closed = closed - else: - raise ValueError("closed value is not valid") - - @property - def closed(self): - return self._closed @classmethod def from_arrow(cls, data): @@ -50,7 +41,6 @@ def from_arrow(cls, data): offset = data.offset null_count = data.null_count children = new_col.children - closed = dtype.closed return IntervalColumn( size=size, @@ -59,7 +49,6 @@ def from_arrow(cls, data): offset=offset, null_count=null_count, children=children, - closed=closed, ) def to_arrow(self): @@ -73,7 +62,7 @@ def to_arrow(self): @classmethod def from_struct_column(cls, struct_column: StructColumn, closed="right"): - first_field_name = list(struct_column.dtype.fields.keys())[0] + first_field_name = next(iter(struct_column.dtype.fields.keys())) return IntervalColumn( size=struct_column.size, dtype=IntervalDtype( @@ -83,20 +72,19 @@ def from_struct_column(cls, struct_column: StructColumn, closed="right"): offset=struct_column.offset, null_count=struct_column.null_count, children=struct_column.base_children, - closed=closed, ) def copy(self, deep=True): - closed = self.closed struct_copy = super().copy(deep=deep) return IntervalColumn( size=struct_copy.size, - dtype=IntervalDtype(struct_copy.dtype.fields["left"], closed), + dtype=IntervalDtype( + struct_copy.dtype.fields["left"], self.dtype.closed + ), mask=struct_copy.base_mask, offset=struct_copy.offset, null_count=struct_copy.null_count, children=struct_copy.base_children, - closed=closed, ) def as_interval_column(self, dtype): @@ -109,7 +97,7 @@ def as_interval_column(self, dtype): # when creating an interval series or interval dataframe if dtype == "interval": dtype = IntervalDtype( - self.dtype.fields["left"], self.closed + self.dtype.subtype, self.dtype.closed ) children = self.children return IntervalColumn( @@ -119,7 +107,6 @@ def as_interval_column(self, dtype): offset=self.offset, null_count=self.null_count, children=children, - closed=dtype.closed, ) else: raise ValueError("dtype must be IntervalDtype") @@ -141,5 +128,5 @@ def to_pandas( def element_indexing(self, index: int): result = super().element_indexing(index) if cudf.get_option("mode.pandas_compatible"): - return pd.Interval(**result, closed=self._closed) + return pd.Interval(**result, closed=self.dtype.closed) return result diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index fa7173f1d0f..c10124f4de6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3174,10 +3174,12 @@ def interval_range( data = 
column.column_empty_like_same_mask(left_col, dtype) return IntervalIndex(data, closed=closed) - interval_col = column.build_interval_column( - left_col, right_col, closed=closed + interval_col = IntervalColumn( + dtype=IntervalDtype(left_col.dtype, closed), + size=len(left_col), + children=(left_col, right_col), ) - return IntervalIndex(interval_col) + return IntervalIndex(interval_col, closed=closed) class IntervalIndex(GenericIndex): @@ -3217,44 +3219,72 @@ class IntervalIndex(GenericIndex): def __init__( self, data, - closed=None, + closed: Optional[Literal["left", "right", "neither", "both"]] = None, dtype=None, - copy=False, + copy: bool = False, name=None, ): - if copy: - data = column.as_column(data, dtype=dtype).copy() - kwargs = _setdefault_name(data, name=name) - - if closed is None: - closed = "right" + name = _setdefault_name(data, name=name)["name"] - if isinstance(data, IntervalColumn): - data = data - elif isinstance(data, pd.Series) and isinstance( - data.dtype, pd.IntervalDtype - ): - data = column.as_column(data, data.dtype) - elif isinstance(data, (pd.Interval, pd.IntervalIndex)): - data = column.as_column( - data, - dtype=dtype, - ) - elif len(data) == 0: - subtype = getattr(data, "dtype", "int64") - dtype = IntervalDtype(subtype, closed) - data = column.column_empty_like_same_mask( - column.as_column(data), dtype + if dtype is not None: + dtype = cudf.dtype(dtype) + if not isinstance(dtype, IntervalDtype): + raise TypeError("dtype must be an IntervalDtype") + if closed is not None and closed != dtype.closed: + raise ValueError("closed keyword does not match dtype.closed") + closed = dtype.closed + + if closed is None and isinstance(dtype, IntervalDtype): + closed = dtype.closed + + closed = closed or "right" + + if len(data) == 0: + if not hasattr(data, "dtype"): + data = np.array([], dtype=np.int64) + elif isinstance(data.dtype, (pd.IntervalDtype, IntervalDtype)): + data = np.array([], dtype=data.dtype.subtype) + interval_col = IntervalColumn( + dtype=IntervalDtype(data.dtype, closed), + size=len(data), + children=(as_column(data), as_column(data)), ) else: - data = column.as_column(data) - data.dtype.closed = closed + col = as_column(data) + if not isinstance(col, IntervalColumn): + raise TypeError("data must be an iterable of Interval data") + if copy: + col = col.copy() + interval_col = IntervalColumn( + dtype=IntervalDtype(col.dtype.subtype, closed), + mask=col.mask, + size=col.size, + offset=col.offset, + null_count=col.null_count, + children=col.children, + ) - self.closed = closed - super().__init__(data, **kwargs) + if dtype: + interval_col = interval_col.astype(dtype) # type: ignore[assignment] + super().__init__(interval_col, name=name) + + @property + def closed(self): + return self._values.dtype.closed + + @classmethod @_cudf_nvtx_annotate - def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): + def from_breaks( + cls, + breaks, + closed: Optional[ + Literal["left", "right", "neither", "both"] + ] = "right", + name=None, + copy: bool = False, + dtype=None, + ): """ Construct an IntervalIndex from an array of splits. 
@@ -3283,16 +3313,28 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval[int64, right]') """ + breaks = as_column(breaks, dtype=dtype) if copy: - breaks = column.as_column(breaks, dtype=dtype).copy() - left_col = breaks[:-1:] - right_col = breaks[+1::] - - interval_col = column.build_interval_column( - left_col, right_col, closed=closed + breaks = breaks.copy() + left_col = breaks.slice(0, len(breaks) - 1) + right_col = breaks.slice(1, len(breaks)) + # For indexing, children should both have 0 offset + right_col = column.build_column( + data=right_col.data, + dtype=right_col.dtype, + size=right_col.size, + mask=right_col.mask, + offset=0, + null_count=right_col.null_count, + children=right_col.children, ) - return IntervalIndex(interval_col, name=name) + interval_col = IntervalColumn( + dtype=IntervalDtype(left_col.dtype, closed), + size=len(left_col), + children=(left_col, right_col), + ) + return IntervalIndex(interval_col, name=name, closed=closed) def __getitem__(self, index): raise NotImplementedError( diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 52c49aebf35..5a6155ece29 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd import pyarrow as pa @@ -57,11 +57,9 @@ def test_interval_range_dtype_basic(start_t, end_t): @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -@pytest.mark.parametrize("start", [0]) -@pytest.mark.parametrize("end", [0]) -def test_interval_range_empty(start, end, closed): - pindex = pd.interval_range(start=start, end=end, closed=closed) - gindex = cudf.interval_range(start=start, end=end, closed=closed) +def test_interval_range_empty(closed): + pindex = pd.interval_range(start=0, end=0, closed=closed) + gindex = cudf.interval_range(start=0, end=0, closed=closed) assert_eq(pindex, gindex) @@ -315,3 +313,22 @@ def test_intervalindex_empty_typed_non_int(): result = cudf.IntervalIndex(data) expected = pd.IntervalIndex(data) assert_eq(result, expected) + + +def test_intervalindex_invalid_dtype(): + with pytest.raises(TypeError): + cudf.IntervalIndex([pd.Interval(1, 2)], dtype="int64") + + +def test_intervalindex_conflicting_closed(): + with pytest.raises(ValueError): + cudf.IntervalIndex( + [pd.Interval(1, 2)], + dtype=cudf.IntervalDtype("int64", closed="left"), + closed="right", + ) + + +def test_intervalindex_invalid_data(): + with pytest.raises(TypeError): + cudf.IntervalIndex([1, 2]) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index ad0c961a749..11970944a95 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import math import operator @@ -636,7 +636,7 @@ def func(row): ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) ), cudf.Series([1, 2, 3], dtype="category"), - cudf.interval_range(start=0, end=3, closed=True), + cudf.interval_range(start=0, end=3), [[1, 2], [3, 4], [5, 6]], [{"a": 1}, {"a": 2}, {"a": 3}], ], From c949abeef0a62d94430746995ba1ff68865365db Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 23 Jan 2024 11:47:04 -0500 Subject: [PATCH 41/60] Add ci check for external kernels (#14768) Adds CI checks so that libcudf doesn't reintroduce weak/external CUDA kernels. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14768 --- .github/workflows/pr.yaml | 8 ++++++++ .github/workflows/test.yaml | 9 +++++++++ cpp/cmake/thirdparty/patches/cccl_override.json | 5 +++++ 3 files changed, 22 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c94724bcf8c..2fe4bf0b05e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -14,6 +14,7 @@ jobs: needs: - checks - conda-cpp-build + - conda-cpp-checks - conda-cpp-tests - conda-python-build - conda-python-cudf-tests @@ -43,6 +44,13 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 with: build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: pull-request + enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index df26a8c5916..7bb2530a7bc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,6 +14,15 @@ on: type: string jobs: + conda-cpp-checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + enable_check_symbols: true conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index fa82bfb5421..68fc8979c46 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -8,6 +8,11 @@ "issue" : "CCCL installs header-search.cmake files in nondeterministic order and has a typo in checking target creation that leads to duplicates", "fixed_in" : "2.3" }, + { + "file" : "cccl/hide_kernels.diff", + "issue" : "Mark all cub and thrust kernels with hidden visibility [https://github.com/nvidia/cccl/pulls/443]", + "fixed_in" : "2.3" + }, { "file" : "cccl/revert_pr_211.diff", "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", From bb025142b40d10125cf3297085f23cfe28e02d20 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 23 Jan 2024 11:01:41 -0600 Subject: [PATCH 42/60] Add developer guideline to use east const. (#14836) This PR documents the libcudf preference for "east const." Follow-up from #13491, #13492, #13493, #13494. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14836 --- cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index c38151d7518..2606b487c07 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -129,6 +129,10 @@ and we try to follow his rules: "No raw loops. No raw pointers. No raw synchroni Additional style guidelines for libcudf code include: + * Prefer "east const", placing `const` after the type. This is not + automatically enforced by `clang-format` because the option + `QualifierAlignment: Right` has been observed to produce false negatives and + false positives. * [NL.11: Make Literals Readable](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#nl11-make-literals-readable): Decimal values should use integer separators every thousands place, like From 67a36a9104097cd6a8ae6efee1018e249f2fe441 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 23 Jan 2024 09:06:58 -1000 Subject: [PATCH 43/60] Simplify ColumnAccessor methods; avoid unnecessary validations (#14758) For methods that essentially do ```python def select_by_foo(self, ...): ... return self.__class__(data={subset of self._data}) ``` The `return` would perform validation on the returned subset of column, but I think that's unnecessary since that was done during initialization Additionally * Removed `_create_unsafe` in favor of a `verify=True|False` keyword in the constructor * `_column_length` == `nrows` so removed `_column_length` * Renamed `_compare_keys` to `_keys_equal` * Remove seldom used/unnecessary methods Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14758 --- python/cudf/cudf/core/column_accessor.py | 141 +++++++++-------------- python/cudf/cudf/core/dataframe.py | 14 ++- python/cudf/cudf/core/frame.py | 6 +- python/cudf/cudf/core/multiindex.py | 3 +- 4 files changed, 70 insertions(+), 94 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 021d4994613..d87580fcfac 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -21,7 +21,6 @@ import pandas as pd from packaging.version import Version from pandas.api.types import is_bool -from typing_extensions import Self import cudf from cudf.core import column @@ -66,7 +65,7 @@ def __getitem__(self, key): return super().__getitem__(key) -def _to_flat_dict_inner(d, parents=()): +def _to_flat_dict_inner(d: dict, parents: tuple = ()): for k, v in d.items(): if not isinstance(v, d.__class__): if parents: @@ -76,14 +75,6 @@ def _to_flat_dict_inner(d, parents=()): yield from _to_flat_dict_inner(d=v, parents=parents + (k,)) -def _to_flat_dict(d): - """ - Convert the given nested dictionary to a flat dictionary - with tuple keys. 
- """ - return {k: v for k, v in _to_flat_dict_inner(d)} - - class ColumnAccessor(abc.MutableMapping): """ Parameters @@ -103,6 +94,9 @@ class ColumnAccessor(abc.MutableMapping): label_dtype : Dtype, optional What dtype should be returned in `to_pandas_index` (default=None). + verify : bool, optional + For non ColumnAccessor inputs, whether to verify + column length and type """ _data: "Dict[Any, ColumnBase]" @@ -116,6 +110,7 @@ def __init__( level_names=None, rangeindex: bool = False, label_dtype: Dtype | None = None, + verify: bool = True, ): self.rangeindex = rangeindex self.label_dtype = label_dtype @@ -133,9 +128,9 @@ def __init__( else: # This code path is performance-critical for copies and should be # modified with care. - self._data = {} - if data: - data = dict(data) + data = dict(data) + if data and verify: + result = {} # Faster than next(iter(data.values())) column_length = len(data[next(iter(data))]) for k, v in data.items(): @@ -146,30 +141,14 @@ def __init__( v = column.as_column(v) if len(v) != column_length: raise ValueError("All columns must be of equal length") - self._data[k] = v + result[k] = v + self._data = result + else: + self._data = data self.multiindex = multiindex self._level_names = level_names - @classmethod - def _create_unsafe( - cls, - data: Dict[Any, ColumnBase], - multiindex: bool = False, - level_names=None, - rangeindex: bool = False, - label_dtype: Dtype | None = None, - ) -> ColumnAccessor: - # create a ColumnAccessor without verifying column - # type or size - obj = cls() - obj._data = data - obj.multiindex = multiindex - obj._level_names = level_names - obj.rangeindex = rangeindex - obj.label_dtype = label_dtype - return obj - def __iter__(self): return iter(self._data) @@ -217,7 +196,7 @@ def nlevels(self) -> int: def name(self) -> Any: return self.level_names[-1] - @property + @cached_property def nrows(self) -> int: if len(self._data) == 0: return 0 @@ -243,13 +222,6 @@ def _grouped_data(self) -> abc.MutableMapping: else: return self._data - @cached_property - def _column_length(self): - try: - return len(self._data[next(iter(self._data))]) - except StopIteration: - return 0 - def _clear_cache(self): cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -258,9 +230,9 @@ def _clear_cache(self): except AttributeError: pass - # Column length should only be cleared if no data is present. - if len(self._data) == 0 and hasattr(self, "_column_length"): - del self._column_length + # nrows should only be cleared if no data is present. + if len(self._data) == 0 and hasattr(self, "nrows"): + del self.nrows def to_pandas_index(self) -> pd.Index: """Convert the keys of the ColumnAccessor to a Pandas Index object.""" @@ -345,11 +317,8 @@ def insert( if loc == len(self._data): if validate: value = column.as_column(value) - if len(self._data) > 0: - if len(value) != self._column_length: - raise ValueError("All columns must be of equal length") - else: - self._column_length = len(value) + if len(self._data) > 0 and len(value) != self.nrows: + raise ValueError("All columns must be of equal length") self._data[name] = value else: new_keys = self.names[:loc] + (name,) + self.names[loc:] @@ -362,15 +331,16 @@ def copy(self, deep=False) -> ColumnAccessor: Make a copy of this ColumnAccessor. 
""" if deep or cudf.get_option("copy_on_write"): - return self.__class__( - {k: v.copy(deep=deep) for k, v in self._data.items()}, - multiindex=self.multiindex, - level_names=self.level_names, - ) + data = {k: v.copy(deep=deep) for k, v in self._data.items()} + else: + data = self._data.copy() return self.__class__( - self._data.copy(), + data=data, multiindex=self.multiindex, level_names=self.level_names, + rangeindex=self.rangeindex, + label_dtype=self.label_dtype, + verify=False, ) def select_by_label(self, key: Any) -> ColumnAccessor: @@ -508,22 +478,12 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True): key = self._pad_key(key) if validate: value = column.as_column(value) - if len(self._data) > 0: - if len(value) != self._column_length: - raise ValueError("All columns must be of equal length") - else: - self._column_length = len(value) + if len(self._data) > 0 and len(value) != self.nrows: + raise ValueError("All columns must be of equal length") self._data[key] = value self._clear_cache() - def _select_by_names(self, names: abc.Sequence) -> Self: - return self.__class__( - {key: self[key] for key in names}, - multiindex=self.multiindex, - level_names=self.level_names, - ) - def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: # Might be a generator key = tuple(key) @@ -541,7 +501,7 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: else: data = {k: self._grouped_data[k] for k in key} if self.multiindex: - data = _to_flat_dict(data) + data = dict(_to_flat_dict_inner(data)) return self.__class__( data, multiindex=self.multiindex, @@ -550,11 +510,16 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: result = self._grouped_data[key] - if isinstance(result, cudf.core.column.ColumnBase): - return self.__class__({key: result}, multiindex=self.multiindex) + if isinstance(result, column.ColumnBase): + # self._grouped_data[key] = self._data[key] so skip validation + return self.__class__( + data={key: result}, + multiindex=self.multiindex, + verify=False, + ) else: if self.multiindex: - result = _to_flat_dict(result) + result = dict(_to_flat_dict_inner(result)) if not isinstance(key, tuple): key = (key,) return self.__class__( @@ -575,11 +540,11 @@ def _select_by_label_slice(self, key: slice) -> ColumnAccessor: start = self._pad_key(start, slice(None)) stop = self._pad_key(stop, slice(None)) for idx, name in enumerate(self.names): - if _compare_keys(name, start): + if _keys_equal(name, start): start_idx = idx break for idx, name in enumerate(reversed(self.names)): - if _compare_keys(name, stop): + if _keys_equal(name, stop): stop_idx = len(self.names) - idx break keys = self.names[start_idx:stop_idx] @@ -587,14 +552,16 @@ def _select_by_label_slice(self, key: slice) -> ColumnAccessor: {k: self._data[k] for k in keys}, multiindex=self.multiindex, level_names=self.level_names, + verify=False, ) def _select_by_label_with_wildcard(self, key: Any) -> ColumnAccessor: key = self._pad_key(key, slice(None)) return self.__class__( - {k: self._data[k] for k in self._data if _compare_keys(k, key)}, + {k: self._data[k] for k in self._data if _keys_equal(k, key)}, multiindex=self.multiindex, level_names=self.level_names, + verify=False, ) def _pad_key(self, key: Any, pad_value="") -> Any: @@ -639,6 +606,7 @@ def rename_levels( to the given mapper and level. 
""" + new_col_names: abc.Iterable if self.multiindex: def rename_column(x): @@ -655,12 +623,7 @@ def rename_column(x): "Renaming columns with a MultiIndex and level=None is" "not supported" ) - new_names = map(rename_column, self.keys()) - ca = ColumnAccessor( - dict(zip(new_names, self.values())), - level_names=self.level_names, - multiindex=self.multiindex, - ) + new_col_names = (rename_column(k) for k in self.keys()) else: if level is None: @@ -680,13 +643,13 @@ def rename_column(x): if len(new_col_names) != len(set(new_col_names)): raise ValueError("Duplicate column names are not allowed") - ca = ColumnAccessor( - dict(zip(new_col_names, self.values())), - level_names=self.level_names, - multiindex=self.multiindex, - ) - - return self.__class__(ca) + data = dict(zip(new_col_names, self.values())) + return self.__class__( + data=data, + level_names=self.level_names, + multiindex=self.multiindex, + verify=False, + ) def droplevel(self, level): # drop the nth level @@ -708,7 +671,7 @@ def droplevel(self, level): self._clear_cache() -def _compare_keys(target: Any, key: Any) -> bool: +def _keys_equal(target: Any, key: Any) -> bool: """ Compare `key` to `target`. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2f18c194fde..2acb250ee13 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -481,12 +481,22 @@ def __getitem__(self, arg): index = self._frame.index if col_is_scalar: s = Series._from_data( - ca._select_by_names(column_names), index=index + data=ColumnAccessor( + {key: ca._data[key] for key in column_names}, + multiindex=ca.multiindex, + level_names=ca.level_names, + ), + index=index, ) return s._getitem_preprocessed(row_spec) if column_names != list(self._frame._column_names): frame = self._frame._from_data( - ca._select_by_names(column_names), index=index + data=ColumnAccessor( + {key: ca._data[key] for key in column_names}, + multiindex=ca.multiindex, + level_names=ca.level_names, + ), + index=index, ) else: frame = self._frame diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fc313a62fd0..eb14a8948af 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -278,12 +278,13 @@ def astype(self, dtype, copy: bool = False): for col_name, col in self._data.items() } - return ColumnAccessor._create_unsafe( + return ColumnAccessor( data=result_data, multiindex=self._data.multiindex, level_names=self._data.level_names, rangeindex=self._data.rangeindex, label_dtype=self._data.label_dtype, + verify=False, ) @_cudf_nvtx_annotate @@ -881,12 +882,13 @@ def fillna( return self._mimic_inplace( self._from_data( - data=ColumnAccessor._create_unsafe( + data=ColumnAccessor( data=filled_data, multiindex=self._data.multiindex, level_names=self._data.level_names, rangeindex=self._data.rangeindex, label_dtype=self._data.label_dtype, + verify=False, ) ), inplace=inplace, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8ba47795437..d19fb966194 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -224,9 +224,10 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... 
- self._data = self._data.__class__._create_unsafe( + self._data = self._data.__class__( dict(zip(value, self._data.values())), level_names=self._data.level_names, + verify=False, ) self._names = pd.core.indexes.frozen.FrozenList(value) From 16a942da598ec818b27916e4217a35e31a89d353 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Jan 2024 12:02:07 -0800 Subject: [PATCH 44/60] Use `rapids_cuda_set_runtime` to determine cuda runtime usage by target (#14833) This PR uses rapids-cmake to handle per-target CMake linking to cudart. Replaces #13543 and #11641. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mark Harris (https://github.com/harrism) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/14833 --- cpp/CMakeLists.txt | 16 +++++----------- cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 5 +++-- cpp/tests/CMakeLists.txt | 1 + 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index df158a64d0b..c9d93f83e5c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -18,6 +18,7 @@ include(../fetch_rapids.cmake) include(rapids-cmake) include(rapids-cpm) include(rapids-cuda) +include(${rapids-cmake-dir}/cuda/set_runtime.cmake) include(rapids-export) include(rapids-find) @@ -780,17 +781,7 @@ if(TARGET conda_env) target_link_libraries(cudf PRIVATE conda_env) endif() -if(CUDA_STATIC_RUNTIME) - # Tell CMake what CUDA language runtime to use - set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Static) - # Make sure to export to consumers what runtime we used - target_link_libraries(cudf PUBLIC CUDA::cudart_static) -else() - # Tell CMake what CUDA language runtime to use - set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Shared) - # Make sure to export to consumers what runtime we used - target_link_libraries(cudf PUBLIC CUDA::cudart) -endif() +rapids_cuda_set_runtime(cudf USE_STATIC ${CUDA_STATIC_RUNTIME}) file( WRITE "${CUDF_BINARY_DIR}/fatbin.ld" @@ -838,6 +829,7 @@ if(CUDF_BUILD_TESTUTIL) PUBLIC cudf PRIVATE $ ) + rapids_cuda_set_runtime(cudftest_default_stream USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) @@ -881,6 +873,7 @@ if(CUDF_BUILD_TESTUTIL) cudftestutil PUBLIC "$" "$" ) + rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::cudftestutil ALIAS cudftestutil) endif() @@ -919,6 +912,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() + rapids_cuda_set_runtime(${_tgt} USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::${_tgt} ALIAS ${_tgt}) if("${_mode}" STREQUAL "testing") diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index baabffceeac..8a40be1dc94 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,7 +16,8 @@ add_executable(jitify_preprocess "${JITIFY_INCLUDE_DIR}/jitify2_preprocess.cpp") target_compile_definitions(jitify_preprocess PRIVATE "_FILE_OFFSET_BITS=64") -target_link_libraries(jitify_preprocess CUDA::cudart ${CMAKE_DL_LIBS}) +rapids_cuda_set_runtime(jitify_preprocess USE_STATIC ${CUDA_STATIC_RUNTIME}) +target_link_libraries(jitify_preprocess PUBLIC ${CMAKE_DL_LIBS}) # Take a list of files to JIT-compile and run them through jitify_preprocess. function(jit_preprocess_files) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eee736613fe..064d0c49f80 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -58,6 +58,7 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main $ ) + rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME} From 45bf274de87ea29d5ba8e6f4aac2fa048141312a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 23 Jan 2024 18:25:17 -0600 Subject: [PATCH 45/60] Fix calls to deprecated strings factory API in examples. (#14838) Follow-up PR to #14771. I noticed the strings example code still had a deprecated function call: ``` -- Build files have been written to: /opt/conda/conda-bld/work/cpp/examples/strings/build [1/8] Building CXX object CMakeFiles/libcudf_apis.dir/libcudf_apis.cpp.o [2/8] Linking CXX executable libcudf_apis [3/8] Building CUDA object CMakeFiles/custom_prealloc.dir/custom_prealloc.cu.o [4/8] Building CUDA object CMakeFiles/custom_with_malloc.dir/custom_with_malloc.cu.o [5/8] Linking CUDA executable custom_prealloc [6/8] Linking CUDA executable custom_with_malloc [7/8] Building CUDA object CMakeFiles/custom_optimized.dir/custom_optimized.cu.o /opt/conda/conda-bld/work/cpp/examples/strings/custom_optimized.cu: In function 'std::unique_ptr redact_strings(const cudf::column_view&, const cudf::column_view&)': /opt/conda/conda-bld/work/cpp/examples/strings/custom_optimized.cu:158:40: warning: 'std::unique_ptr cudf::make_strings_column(cudf::size_type, rmm::device_uvector&&, rmm::device_uvector&&, rmm::device_buffer&&, cudf::size_type)' is deprecated [-Wdeprecated-declarations] 158 | auto result = | ~ ^ /opt/conda/conda-bld/work/cpp/include/cudf/column/column_factories.hpp:510:42: note: declared here 510 | [[deprecated]] std::unique_ptr make_strings_column(size_type num_strings, | ^~~~~~~~~~~~~~~~~~~ [8/8] Linking CUDA executable custom_optimized ``` Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14838 --- cpp/examples/strings/custom_optimized.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu index 522093bc647..aa1468ea790 100644 --- a/cpp/examples/strings/custom_optimized.cu +++ b/cpp/examples/strings/custom_optimized.cu @@ -155,8 +155,7 @@ std::unique_ptr redact_strings(cudf::column_view const& names, *d_names, *d_visibilities, offsets.data(), chars.data()); // create column from offsets and chars vectors (no copy is performed) - auto result = - cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars), {}, 0); + auto result = cudf::make_strings_column(names.size(), std::move(offsets), 
chars.release(), {}, 0); // wait for all of the above to finish stream.synchronize(); From 0a4ce5136685761d3a9d0541f3e2e8ec8867e3cf Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 24 Jan 2024 12:14:05 +1100 Subject: [PATCH 46/60] Remove get_mem_info functions from custom memory resources (#14832) Part of rapidsai/rmm#1388. This removes now-optional and soon-to-be deprecated functions from cuDF's custom device_memory_resource implementations: * `supports_get_mem_info()` * `do_get_mem_info()` Authors: - Mark Harris (https://github.com/harrism) Approvers: - Jason Lowe (https://github.com/jlowe) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14832 --- .../stream_checking_resource_adaptor.hpp | 27 +------------------ java/src/main/native/src/RmmJni.cpp | 14 +--------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp index e18400422aa..90a8c2ccc2f 100644 --- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp +++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,16 +71,6 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res */ bool supports_streams() const noexcept override { return upstream_->supports_streams(); } - /** - * @brief Query whether the resource supports the get_mem_info API. - * - * @return Whether or not the upstream resource supports get_mem_info - */ - bool supports_get_mem_info() const noexcept override - { - return upstream_->supports_get_mem_info(); - } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -131,21 +121,6 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res : upstream_->is_equal(other); } - /** - * @brief Get free and available memory from upstream resource. - * - * @throws `rmm::cuda_error` if unable to retrieve memory info. - * @throws `cudf::logic_error` if attempted on a default stream - * - * @param stream Stream on which to get the mem info. - * @return std::pair with available and free memory for resource - */ - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override - { - verify_stream(stream); - return upstream_->get_mem_info(stream); - } - /** * @brief Throw an error if the provided stream is invalid. * diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 3c49d153cb6..b92d9e4e891 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -96,8 +96,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { return scoped_max_total_allocated; } - bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } - bool supports_streams() const noexcept override { return resource->supports_streams(); } private: @@ -144,10 +142,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { scoped_allocated -= size; } } - - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { - return resource->get_mem_info(stream); - } }; template @@ -213,8 +207,6 @@ class java_event_handler_memory_resource : public device_memory_resource { device_memory_resource *get_wrapped_resource() { return resource; } - bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } - bool supports_streams() const noexcept override { return resource->supports_streams(); } private: @@ -277,10 +269,6 @@ class java_event_handler_memory_resource : public device_memory_resource { } } - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { - return resource->get_mem_info(stream); - } - protected: JavaVM *jvm; jobject handler_obj; From 60f04cefaf699daf621125368398cd62635a583d Mon Sep 17 00:00:00 2001 From: Pantakan Kanprawet Date: Thu, 25 Jan 2024 02:52:58 +0700 Subject: [PATCH 47/60] Notes convert to Pandas-compat (#12641) --- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 107 +++++++------ python/cudf/cudf/core/dataframe.py | 187 ++++++++++++----------- python/cudf/cudf/core/frame.py | 119 ++++++++++----- python/cudf/cudf/core/groupby/groupby.py | 17 ++- python/cudf/cudf/core/indexed_frame.py | 99 ++++++------ python/cudf/cudf/core/series.py | 62 ++++---- python/cudf/cudf/core/tools/datetimes.py | 1 - python/cudf/cudf/core/tools/numeric.py | 15 +- 9 files changed, 343 insertions(+), 274 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 0cccec6f28a..c28489a2f98 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -620,11 +620,6 @@ def sort_values( ------- Series or Index with each list sorted - Notes - ----- - Difference from pandas: - * Not supporting: `inplace`, `kind` - Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -633,6 +628,11 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list + + .. pandas-compat:: + **ListMethods.sort_values** + + The ``inplace`` and ``kind`` arguments are currently not supported. """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c47088caebc..fcb993e1a78 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -594,11 +594,6 @@ def extract( for each group. If `expand=False` and `pat` has only one capture group, then return a Series/Index. - Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -625,6 +620,12 @@ def extract( 1 2 2 dtype: object + + .. pandas-compat:: + **StringMethods.extract** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. 
""" # noqa W605 if not _is_supported_regex_flags(flags): raise NotImplementedError( @@ -672,14 +673,6 @@ def contains( pattern is contained within the string of each element of the Series/Index. - Notes - ----- - The parameters `case` and `na` are not yet supported and will - raise a NotImplementedError if anything other than the default - value is set. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -753,6 +746,15 @@ def contains( 3 True 4 dtype: bool + + .. pandas-compat:: + **StringMethods.contains** + + The parameters `case` and `na` are not yet supported and will + raise a NotImplementedError if anything other than the default + value is set. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ # noqa W605 if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") @@ -951,12 +953,6 @@ def replace( A copy of the object with all matching occurrences of pat replaced by repl. - Notes - ----- - The parameters `case` and `flags` are not yet supported and will raise - a `NotImplementedError` if anything other than the default value - is set. - Examples -------- >>> import cudf @@ -986,6 +982,13 @@ def replace( 1 fuz 2 dtype: object + + .. pandas-compat:: + **StringMethods.replace** + + The parameters `case` and `flags` are not yet supported and will + raise a `NotImplementedError` if anything other than the default + value is set. """ if case is not None: raise NotImplementedError("`case` parameter is not yet supported") @@ -2769,11 +2772,6 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: DataFrame or MultiIndex Returns a DataFrame / MultiIndex - Notes - ----- - The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default value is set. - See Also -------- rpartition @@ -2815,6 +2813,14 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: MultiIndex([('X', ' ', '123'), ('Y', ' ', '999')], ) + + .. pandas-compat:: + **StringMethods.partition** + + The parameter `expand` is not yet supported and will raise a + `NotImplementedError` if anything other than the default + value is set. + """ if expand is not True: raise NotImplementedError( @@ -3500,14 +3506,6 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: ------- Series or Index - Notes - ----- - - `flags` parameter currently only supports re.DOTALL - and re.MULTILINE. - - Some characters need to be escaped when passing - in pat. e.g. ``'$'`` has a special meaning in regex - and must be escaped when finding this literal character. - Examples -------- >>> import cudf @@ -3539,6 +3537,15 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') + + .. pandas-compat:: + **StringMethods.count** + + - `flags` parameter currently only supports re.DOTALL + and re.MULTILINE. + - Some characters need to be escaped when passing + in pat. e.g. ``'$'`` has a special meaning in regex + and must be escaped when finding this literal character. """ # noqa W605 if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U @@ -3570,11 +3577,6 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: All non-overlapping matches of pattern or regular expression in each string of this Series/Index. - Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. 
- Examples -------- >>> import cudf @@ -3615,6 +3617,12 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: 1 [] 2 [b, b] dtype: list + + .. pandas-compat:: + **StringMethods.findall** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U @@ -3797,11 +3805,6 @@ def endswith(self, pat: str) -> SeriesOrIndex: A Series of booleans indicating whether the given pattern matches the end of each string element. - Notes - ----- - `na` parameter is not yet supported, as cudf uses - native strings instead of Python objects. - Examples -------- >>> import cudf @@ -3818,6 +3821,12 @@ def endswith(self, pat: str) -> SeriesOrIndex: 2 False 3 dtype: bool + + .. pandas-compat:: + **StringMethods.endswith** + + `na` parameter is not yet supported, as cudf uses + native strings instead of Python objects. """ if pat is None: raise TypeError( @@ -4245,13 +4254,6 @@ def match( ------- Series or Index of boolean values. - Notes - ----- - Parameters `case` and `na` are currently not supported. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - - Examples -------- >>> import cudf @@ -4272,6 +4274,13 @@ def match( 1 True 2 True dtype: bool + + .. pandas-compat:: + **StringMethods.match** + + Parameters `case` and `na` are currently not supported. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ if case is not True: raise NotImplementedError("`case` parameter is not yet supported") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2acb250ee13..7c48352d861 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3265,10 +3265,6 @@ def diff(self, periods=1, axis=0): DataFrame First differences of the DataFrame. - Notes - ----- - Diff currently only supports numeric dtype columns. - Examples -------- >>> import cudf @@ -3292,6 +3288,10 @@ def diff(self, periods=1, axis=0): 4 2 3 16 5 2 5 20 + .. pandas-compat:: + **DataFrame.diff** + + Diff currently only supports numeric dtype columns. """ if not is_integer(periods): if not (is_float(periods) and periods.is_integer()): @@ -3467,14 +3467,6 @@ def rename( ------- DataFrame - Notes - ----- - Difference from pandas: - * Not supporting: level - - Rename will not overwrite column names. If a list with duplicates is - passed, column names will be postfixed with a number. - Examples -------- >>> import cudf @@ -3500,6 +3492,15 @@ def rename( 10 1 4 20 2 5 30 3 6 + + .. pandas-compat:: + **DataFrame.rename** + + * Not Supporting: level + + Rename will not overwrite column names. If a list with + duplicates is passed, column names will be postfixed + with a number. """ if errors != "ignore": raise NotImplementedError( @@ -3599,10 +3600,10 @@ def agg(self, aggs, axis=None): When ``DataFrame.agg`` is called with several aggs, ``DataFrame`` is returned. - Notes - ----- - Difference from pandas: - * Not supporting: ``axis``, ``*args``, ``**kwargs`` + .. pandas-compat:: + **DataFrame.agg** + + * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ # TODO: Remove the typecasting below once issue #6846 is fixed @@ -3735,11 +3736,6 @@ def nlargest(self, n, columns, keep="first"): The first `n` rows ordered by the given columns in descending order. 
- Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3774,6 +3770,11 @@ def nlargest(self, n, columns, keep="first"): France 65000000 2583560 FR Italy 59000000 1937894 IT Brunei 434000 12128 BN + + .. pandas-compat:: + **DataFrame.nlargest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(True, n, columns, keep) @@ -3800,11 +3801,6 @@ def nsmallest(self, n, columns, keep="first"): ------- DataFrame - Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3846,6 +3842,11 @@ def nsmallest(self, n, columns, keep="first"): Anguilla 11300 311 AI Tuvalu 11300 38 TV Nauru 337000 182 NR + + .. pandas-compat:: + **DataFrame.nsmallest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(False, n, columns, keep) @@ -3923,10 +3924,11 @@ def transpose(self): ------- a new (ncol x nrow) dataframe. self is (nrow x ncol) - Notes - ----- - Difference from pandas: - Not supporting *copy* because default and only behavior is copy=True + .. pandas-compat:: + **DataFrame.transpose, DataFrame.T** + + Not supporting *copy* because default and only behavior is + copy=True """ index = self._data.to_pandas_index() @@ -4078,10 +4080,6 @@ def merge( ------- merged : DataFrame - Notes - ----- - **DataFrames merges in cuDF result in non-deterministic row ordering.** - Examples -------- >>> import cudf @@ -4117,6 +4115,12 @@ def merge( right dtype respectively. This extends to semi and anti joins. - For outer joins, the result will be the union of categories from both sides. + + .. pandas-compat:: + **DataFrame.merge** + + DataFrames merges in cuDF result in non-deterministic row + ordering. """ if indicator: raise NotImplementedError( @@ -4187,12 +4191,11 @@ def join( ------- joined : DataFrame - Notes - ----- - Difference from pandas: + .. pandas-compat:: + **DataFrame.join** - - *other* must be a single DataFrame for now. - - *on* is not supported yet due to lack of multi-index support. + - *other* must be a single DataFrame for now. + - *on* is not supported yet due to lack of multi-index support. """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") @@ -5327,11 +5330,6 @@ def from_arrow(cls, table): ------- cudf DataFrame - Notes - ----- - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. - Examples -------- >>> import cudf @@ -5342,6 +5340,12 @@ def from_arrow(cls, table): 0 1 4 1 2 5 2 3 6 + + .. pandas-compat:: + **DataFrame.from_arrow** + + - Does not support automatically setting index column(s) similar + to how ``to_pandas`` works for PyArrow Tables. """ index_col = None col_index_names = None @@ -5701,14 +5705,6 @@ def quantile( If q is a float, a Series will be returned where the index is the columns of self and the values are the quantiles. - .. pandas-compat:: - **DataFrame.quantile** - - One notable difference from Pandas is when DataFrame is of - non-numeric types and result is expected to be a Series in case of - Pandas. cuDF will return a DataFrame as it doesn't support mixed - types under Series. - Examples -------- >>> import cupy as cp @@ -5729,6 +5725,14 @@ def quantile( a b 0.1 1.3 3.7 0.5 2.5 55.0 + + .. 
pandas-compat:: + **DataFrame.quantile** + + One notable difference from Pandas is when DataFrame is of + non-numeric types and result is expected to be a Series in case of + Pandas. cuDF will return a DataFrame as it doesn't support mixed + types under Series. """ # noqa: E501 if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") @@ -6001,10 +6005,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Series For each column/row the number of non-NA/null entries. - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -6018,6 +6018,12 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Age 4 Single 5 dtype: int64 + + .. pandas-compat:: + **DataFrame.count** + + Parameters currently not supported are `axis`, `level`, + `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) if axis != 0: @@ -6191,10 +6197,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): cudf.Series.value_counts : Return the counts of values in a Series. - Notes - ----- - ``axis`` parameter is currently not supported. - Examples -------- >>> import cudf @@ -6233,6 +6235,11 @@ def mode(self, axis=0, numeric_only=False, dropna=True): legs wings 0 2 0.0 1 2.0 + + .. pandas-compat:: + **DataFrame.mode** + + ``axis`` parameter is currently not supported. """ if axis not in (0, "index"): raise NotImplementedError("Only axis=0 is currently supported") @@ -7007,7 +7014,7 @@ def to_struct(self, name=None): Notes ----- - Note that a copy of the columns is made. + Note: a copy of the columns is made. """ if not all(isinstance(name, str) for name in self._data.names): warnings.warn( @@ -7112,22 +7119,18 @@ def append( ------- DataFrame + Notes + ----- + Iteratively appending rows to a cudf DataFrame can be more + computationally intensive than a single concatenate. A better solution + is to append those rows to a list and then concatenate the list with + the original DataFrame all at once. + See Also -------- cudf.concat : General function to concatenate DataFrame or objects. - Notes - ----- - If a list of dict/series is passed and the keys are all contained in - the DataFrame's index, the order of the columns in the resulting - DataFrame will be unchanged. - Iteratively appending rows to a cudf DataFrame can be more - computationally intensive than a single concatenate. A better - solution is to append those rows to a list and then concatenate - the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet. - Examples -------- >>> import cudf @@ -7182,6 +7185,14 @@ def append( 2 2 3 3 4 4 + + .. pandas-compat:: + **DataFrame.append** + + * If a list of dict/series is passed and the keys are all contained + in the DataFrame's index, the order of the columns in the + resulting DataFrame will be unchanged. + * The `verify_integrity` parameter is not supported yet. """ if isinstance(other, dict): if not ignore_index: @@ -7503,22 +7514,6 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): DataFrame if any assignment statements are included in ``expr``, or None if ``inplace=True``. - Notes - ----- - Difference from pandas: - * Additional kwargs are not supported. - * Bitwise and logical operators are not dtype-dependent. - Specifically, `&` must be used for bitwise operators on integers, - not `and`, which is specifically for the logical and between - booleans. - * Only numerical types currently support all operators. 
- * String types currently support comparison operators. - * Operators generally will not cast automatically. Users are - responsible for casting columns to suitable types before - evaluating a function. - * Multiple assignments to the same name (i.e. a sequence of - assignment statements where later statements are conditioned upon - the output of earlier statements) is not supported. Examples -------- @@ -7581,6 +7576,22 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 2 3 6 9 -3 3 4 4 8 0 4 5 2 7 3 + + .. pandas-compat:: + **DataFrame.eval** + + * Additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * Only numerical types are currently supported. + * Operators generally will not cast automatically. Users are + responsible for casting columns to suitable types before + evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. """ if kwargs: raise ValueError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index eb14a8948af..1e6ff118626 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -251,7 +251,6 @@ def size(self) -> int: """ return self._num_columns * self._num_rows - @_cudf_nvtx_annotate def memory_usage(self, deep=False): """Return the memory usage of an object. @@ -597,6 +596,8 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: dtype: int64 .. pandas-compat:: + **DataFrame.where, Series.where** + Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1948,10 +1949,6 @@ def min( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -1960,6 +1957,11 @@ def min( a 1 b 7 dtype: int64 + + .. pandas-compat:: + **DataFrame.min, Series.min** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "min", @@ -1999,10 +2001,6 @@ def max( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2011,6 +2009,11 @@ def max( a 4 b 10 dtype: int64 + + .. pandas-compat:: + **DataFrame.max, Series.max** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "max", @@ -2055,10 +2058,6 @@ def sum( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2067,6 +2066,11 @@ def sum( a 10 b 34 dtype: int64 + + .. pandas-compat:: + **DataFrame.sum, Series.sum** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "sum", @@ -2113,10 +2117,6 @@ def product( ------- Series - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2125,6 +2125,11 @@ def product( a 24 b 5040 dtype: int64 + + .. pandas-compat:: + **DataFrame.product, Series.product** + + Parameters currently not supported are level`, `numeric_only`. 
""" return self._reduce( @@ -2224,11 +2229,6 @@ def std( ------- Series - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2237,6 +2237,12 @@ def std( a 1.290994 b 1.290994 dtype: float64 + + .. pandas-compat:: + **DataFrame.std, Series.std** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( @@ -2280,11 +2286,6 @@ def var( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2293,6 +2294,12 @@ def var( a 1.666667 b 1.666667 dtype: float64 + + .. pandas-compat:: + **DataFrame.var, Series.var** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( "var", @@ -2330,10 +2337,6 @@ def kurtosis( ------- Series or scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only` - Examples -------- **Series** @@ -2351,6 +2354,11 @@ def kurtosis( a -1.2 b -1.2 dtype: float64 + + .. pandas-compat:: + **DataFrame.kurtosis** + + Parameters currently not supported are `level` and `numeric_only` """ if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2388,11 +2396,6 @@ def skew( ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - Examples -------- **Series** @@ -2417,6 +2420,12 @@ def skew( a 0.00000 b -0.37037 dtype: float64 + + .. pandas-compat:: + **DataFrame.skew, Series.skew, Frame.skew** + + Parameters currently not supported are `axis`, `level` and + `numeric_only` """ if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2469,6 +2478,18 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): a True b False dtype: bool + + .. pandas-compat:: + **DataFrame.all, Series.all** + + Parameters currently not supported are `axis`, `bool_only`, + `level`. + + .. pandas-compat:: + **DataFrame.all, Series.all** + + Parameters currently not supported are `axis`, `bool_only`, + `level`. """ return self._reduce( "all", @@ -2517,6 +2538,18 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): a True b True dtype: bool + + .. pandas-compat:: + **DataFrame.any, Series.any** + + Parameters currently not supported are `axis`, `bool_only`, + `level`. + + .. pandas-compat:: + **DataFrame.any, Series.any** + + Parameters currently not supported are `axis`, `bool_only`, + `level`. """ return self._reduce( "any", @@ -2542,10 +2575,6 @@ def median( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only`. - Examples -------- >>> import cudf @@ -2560,6 +2589,16 @@ def median( dtype: int64 >>> ser.median() 17.0 + + .. pandas-compat:: + **DataFrame.median, Series.median** + + Parameters currently not supported are `level` and `numeric_only`. + + .. pandas-compat:: + **DataFrame.median, Series.median** + + Parameters currently not supported are `level` and `numeric_only`. """ return self._reduce( "median", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e28ba233c56..c4d92b84c99 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -685,10 +685,10 @@ def _reduce( Series or DataFrame Computed {op} of values within each group. - Notes - ----- - Difference from pandas: - * Not supporting: numeric_only, min_count + .. 
pandas-compat:: + **{cls}.{op}** + + The numeric_only, min_count """ if numeric_only: raise NotImplementedError( @@ -1382,7 +1382,7 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **groupby.apply** + **GroupBy.apply** cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -2283,9 +2283,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Series or DataFrame Object shifted within each group. - Notes - ----- - Parameter ``freq`` is unsupported. + .. pandas-compat:: + **GroupBy.shift** + + Parameter ``freq`` is unsupported. """ if freq is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3e564919090..6c0aba34970 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -446,11 +446,6 @@ def empty(self): out : bool If DataFrame/Series is empty, return True, if not return False. - Notes - ----- - If DataFrame/Series contains only `null` values, it is still not - considered empty. See the example below. - Examples -------- >>> import cudf @@ -491,6 +486,12 @@ def empty(self): Series([], dtype: float64) >>> s.empty True + + .. pandas-compat:: + **DataFrame.empty, Series.empty** + + If DataFrame/Series contains only `null` values, it is still not + considered empty. See the example above. """ return self.size == 0 @@ -638,11 +639,6 @@ def replace( result : Series Series after replacement. The mask and index are preserved. - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - Examples -------- **Series** @@ -785,6 +781,12 @@ def replace( 2 2 7 c 3 3 8 d 4 4 9 e + + .. pandas-compat:: + **DataFrame.replace, Series.replace** + + Parameters that are currently not supported are: `limit`, `regex`, + `method` """ if limit is not None: raise NotImplementedError("limit parameter is not implemented yet") @@ -1125,13 +1127,6 @@ def truncate(self, before=None, after=None, axis=0, copy=True): `before` and `after` may be specified as strings instead of Timestamps. - .. pandas-compat:: - **DataFrame.truncate, Series.truncate** - - The ``copy`` parameter is only present for API compatibility, but - ``copy=False`` is not supported. This method always generates a - copy. - Examples -------- **Series** @@ -1273,6 +1268,13 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:25 1 2 2021-01-01 23:45:26 1 2 2021-01-01 23:45:27 1 2 + + .. pandas-compat:: + **DataFrame.truncate, Series.truncate** + + The ``copy`` parameter is only present for API compatibility, but + ``copy=False`` is not supported. This method always generates a + copy. """ if not copy: raise ValueError("Truncating with copy=False is not supported.") @@ -1527,11 +1529,6 @@ def sort_index( ------- Frame or None - Notes - ----- - Difference from pandas: - * Not supporting: kind, sort_remaining=False - Examples -------- **Series** @@ -1574,6 +1571,11 @@ def sort_index( 1 2 3 3 1 2 2 3 1 + + .. pandas-compat:: + **DataFrame.sort_index, Series.sort_index** + + * Not supporting: kind, sort_remaining=False """ if kind is not None: raise NotImplementedError("kind is not yet supported") @@ -2383,12 +2385,6 @@ def sort_values( ------- Frame : Frame with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -2400,6 +2396,12 @@ def sort_values( 0 0 -3 2 2 0 1 1 2 + + .. 
pandas-compat:: + **DataFrame.sort_values, Series.sort_values** + + * Support axis='index' only. + * Not supporting: inplace, kind """ if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") @@ -2923,13 +2925,14 @@ def resample( 2018-02-28 18.0 63.333333 - Notes - ----- - Note that the dtype of the index (or the 'on' column if using - 'on=') in the result will be of a frequency closest to the - resampled frequency. For example, if resampling from - nanoseconds to milliseconds, the index will be of dtype - 'datetime64[ms]'. + .. pandas-compat:: + **DataFrame.resample, Series.resample** + + Note that the dtype of the index (or the 'on' column if using + 'on=') in the result will be of a frequency closest to the + resampled frequency. For example, if resampling from + nanoseconds to milliseconds, the index will be of dtype + 'datetime64[ms]'. """ import cudf.core.resample @@ -3405,18 +3408,6 @@ def sample( provided via the `random_state` parameter. This function will always produce the same sample given an identical `random_state`. - Notes - ----- - When sampling from ``axis=0/'index'``, ``random_state`` can be either - a numpy random state (``numpy.random.RandomState``) or a cupy random - state (``cupy.random.RandomState``). When a numpy random state is - used, the output is guaranteed to match the output of the corresponding - pandas method call, but generating the sample may be slow. If exact - pandas equivalence is not required, using a cupy random state will - achieve better performance, especially when sampling large number of - items. It's advised to use the matching `ndarray` type to the random - state for the `weights` array. - Parameters ---------- n : int, optional @@ -3484,6 +3475,20 @@ def sample( a c 0 1 3 1 2 4 + + .. pandas-compat:: + **DataFrame.sample, Series.sample** + + When sampling from ``axis=0/'index'``, ``random_state`` can be + either a numpy random state (``numpy.random.RandomState``) + or a cupy random state (``cupy.random.RandomState``). When a numpy + random state is used, the output is guaranteed to match the output + of the corresponding pandas method call, but generating the sample + maybe slow. If exact pandas equivalence is not required, using a + cupy random state will achieve better performance, + especially when sampling large number of + items. It's advised to use the matching `ndarray` type to + the random state for the `weights` array. """ axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) size = self.shape[axis] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 55100343306..7e25713e63c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1356,10 +1356,11 @@ def map(self, arg, na_action=None) -> "Series": 4 dtype: int64 - Notes - ----- - Please note map currently only supports fixed-width numeric - type functions. + .. pandas-compat:: + **Series.map** + + Please note map currently only supports fixed-width numeric + type functions. """ if isinstance(arg, dict): if hasattr(arg, "__missing__"): @@ -2191,12 +2192,6 @@ def sort_values( ------- Series : Series with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -2208,6 +2203,12 @@ def sort_values( 3 4 1 5 dtype: int64 + + .. pandas-compat:: + **Series.sort_values** + + * Support axis='index' only. 
+ * The inplace and kind argument is currently unsupported """ return super().sort_values( by=self.name, @@ -2652,16 +2653,17 @@ def count(self, level=None): int Number of non-null values in the Series. - Notes - ----- - Parameters currently not supported is `level`. - Examples -------- >>> import cudf >>> ser = cudf.Series([1, 5, 2, 4, 3]) >>> ser.count() 5 + + .. pandas-compat:: + **Series.count** + + Parameters currently not supported is `level`. """ if level is not None: @@ -2765,10 +2767,6 @@ def cov(self, other, min_periods=None): Covariance between Series and other normalized by N-1 (unbiased estimator). - Notes - ----- - `min_periods` parameter is not yet supported. - Examples -------- >>> import cudf @@ -2776,6 +2774,11 @@ def cov(self, other, min_periods=None): >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.cov(ser2) -0.015750000000000004 + + .. pandas-compat:: + **Series.cov** + + `min_periods` parameter is not yet supported. """ if min_periods is not None: @@ -3521,12 +3524,6 @@ def rename(self, index=None, copy=True): ------- Series - Notes - ----- - Difference from pandas: - - Supports scalar values only for changing name attribute - - Not supporting : inplace, level - Examples -------- >>> import cudf @@ -3545,6 +3542,12 @@ def rename(self, index=None, copy=True): Name: numeric_series, dtype: int64 >>> renamed_series.name 'numeric_series' + + .. pandas-compat:: + **Series.rename** + + - Supports scalar values only for changing name attribute + - The ``inplace`` and ``level`` is not supported """ out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) @@ -4724,11 +4727,6 @@ def strftime(self, date_format, *args, **kwargs): Series Series of formatted strings. - Notes - ----- - The following date format identifiers are not yet - supported: ``%c``, ``%x``,``%X`` - Examples -------- >>> import cudf @@ -4755,6 +4753,12 @@ def strftime(self, date_format, *args, **kwargs): 1 2000 / 30 / 06 2 2000 / 30 / 09 dtype: object + + .. pandas-compat:: + **series.DatetimeProperties.strftime** + + The following date format identifiers are not yet + supported: ``%c``, ``%x``,``%X`` """ if not isinstance(date_format, str): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 14459c81966..0e0a32e21fe 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -825,7 +825,6 @@ def date_range( '2023-12-23 08:00:00', '2025-02-23 08:00:00', '2026-04-23 08:00:00'], dtype='datetime64[ns]') - """ if tz is not None: raise NotImplementedError("tz is currently unsupported.") diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index a28c679b8be..8991fbe1c13 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -54,13 +54,6 @@ def to_numeric(arg, errors="raise", downcast=None): Depending on the input, if series is passed in, series is returned, otherwise ndarray - Notes - ----- - An important difference from pandas is that this function does not accept - mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. - A ``TypeError`` will be raised when such input is received, regardless of - ``errors`` parameter. - Examples -------- >>> s = cudf.Series(['1', '2.0', '3e3']) @@ -90,6 +83,14 @@ def to_numeric(arg, errors="raise", downcast=None): 1 1.0 2 3000.0 dtype: float64 + + .. 
pandas-compat:: + **cudf.to_numeric** + + An important difference from pandas is that this function does not + accept mixed numeric/non-numeric type sequences. + For example ``[1, 'a']``. A ``TypeError`` will be raised when such + input is received, regardless of ``errors`` parameter. """ if errors not in {"raise", "ignore", "coerce"}: From f800f5a2fa9a961699345e6febe740b4b8f4760e Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 24 Jan 2024 12:14:05 -0800 Subject: [PATCH 48/60] JSON single quote normalization API (#14729) The goal of this PR is to address [10004](https://github.com/rapidsai/cudf/issues/10004) by supporting parsing of JSON files containing single quotes for field/value strings. This is a follow-up work to the POC [PR 14545](https://github.com/rapidsai/cudf/pull/14545) Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Andy Grove (https://github.com/andygrove) - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) - Elias Stehle (https://github.com/elstehle) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14729 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/detail/json.hpp | 14 +- cpp/include/cudf/io/json.hpp | 31 +++ .../io/json/json_quote_normalization.cu} | 204 +++-------------- cpp/src/io/json/read_json.cu | 21 +- cpp/src/io/json/read_json.hpp | 2 +- cpp/tests/CMakeLists.txt | 2 +- .../io/json_quote_normalization_test.cpp | 215 ++++++++++++++++++ .../main/java/ai/rapids/cudf/JSONOptions.java | 15 ++ java/src/main/java/ai/rapids/cudf/Table.java | 11 +- java/src/main/native/src/TableJni.cpp | 44 ++-- .../test/java/ai/rapids/cudf/TableTest.java | 33 +++ java/src/test/js | 0 java/src/test/resources/single_quotes.json | 2 + 14 files changed, 401 insertions(+), 194 deletions(-) rename cpp/{tests/io/fst/quote_normalization_test.cu => src/io/json/json_quote_normalization.cu} (56%) create mode 100644 cpp/tests/io/json_quote_normalization_test.cpp create mode 100644 java/src/test/js create mode 100644 java/src/test/resources/single_quotes.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90eaec6804a..3925ac55d6b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -375,6 +375,7 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu src/io/json/json_column.cu + src/io/json/json_quote_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/read_json.cu diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index d0a9543397d..0eb0e17ea10 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,4 +51,16 @@ void write_json(data_sink* sink, json_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +/** + * @brief Normalize single quotes to double quotes using FST + * + * @param inbuf Input device buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 2a39a539cc7..f0c3d48ab7e 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -115,6 +115,9 @@ class json_reader_options { // Whether to keep the quote characters of string values bool _keep_quotes = false; + // Normalize single quotes + bool _normalize_single_quotes = false; + // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; @@ -255,6 +258,13 @@ class json_reader_options { */ bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** + * @brief Whether the reader should normalize single quotes around strings + * + * @returns true if the reader should normalize single quotes, false otherwise + */ + bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } + /** * @brief Queries the JSON reader's behavior on invalid JSON lines. * @@ -340,6 +350,14 @@ class json_reader_options { */ void enable_keep_quotes(bool val) { _keep_quotes = val; } + /** + * @brief Set whether the reader should enable normalization of single quotes around strings. + * + * @param val Boolean value to indicate whether the reader should normalize single quotes around + * strings + */ + void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * @@ -502,6 +520,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the reader should normalize single quotes around strings + * + * @param val Boolean value to indicate whether the reader should normalize single quotes + * of strings + * @return this for chaining + */ + json_reader_options_builder& normalize_single_quotes(bool val) + { + options._normalize_single_quotes = val; + return *this; + } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * diff --git a/cpp/tests/io/fst/quote_normalization_test.cu b/cpp/src/io/json/json_quote_normalization.cu similarity index 56% rename from cpp/tests/io/fst/quote_normalization_test.cu rename to cpp/src/io/json/json_quote_normalization.cu index d0794b8f17e..7c9466748cd 100644 --- a/cpp/tests/io/fst/quote_normalization_test.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,19 +15,13 @@ */ #include -#include -#include -#include -#include - -#include -#include +#include #include #include #include -#include +#include #include #include @@ -36,17 +30,16 @@ #include #include -namespace { +namespace cudf::io::json { + +using SymbolT = char; +using StateT = char; +using SymbolOffsetT = uint32_t; -// Type used to represent the atomic symbol type used within the finite-state machine -// TODO: type aliasing to be declared in a common header for better maintainability and -// pre-empt future bugs -using SymbolT = char; -using StateT = char; +namespace normalize_quotes { // Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; -enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; +enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' @@ -62,7 +55,7 @@ constexpr auto TT_DQS = dfa_states::TT_DQS; constexpr auto TT_SQS = dfa_states::TT_SQS; constexpr auto TT_DEC = dfa_states::TT_DEC; constexpr auto TT_SEC = dfa_states::TT_SEC; -constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); // The i-th string representing all the characters of a symbol group @@ -80,7 +73,7 @@ std::array, TT_NUM_STATES> const qna_s }}; // The DFA's starting state -constexpr char start_state = static_cast(TT_OOS); +constexpr auto start_state = static_cast(TT_OOS); struct TransduceToNormalizedQuotes { /** @@ -112,7 +105,7 @@ struct TransduceToNormalizedQuotes { // SEC | Sigma\{'} -> {\*} // Whether this transition translates to the escape sequence: \" - const bool outputs_escape_sequence = + bool const outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Case when a double quote needs to be replaced by the escape sequence: \" @@ -156,19 +149,19 @@ struct TransduceToNormalizedQuotes { SymbolT const read_symbol) const { // Whether this transition translates to the escape sequence: \" - const bool sqs_outputs_escape_sequence = + bool const sqs_outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sqs_outputs_escape_sequence) { return 2; } // Whether this transition translates to the escape sequence \ or unescaped ' - const bool sec_outputs_escape_sequence = + bool const sec_outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sec_outputs_escape_sequence) { return 2; } // Whether this transition translates to no output - const bool sqs_outputs_nop = + bool const sqs_outputs_nop = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition @@ -177,156 +170,33 @@ struct TransduceToNormalizedQuotes { } }; -} // namespace +} // namespace normalize_quotes -// Base test fixture for tests -struct FstTest : public cudf::test::BaseFixture {}; +namespace detail { -void 
run_test(std::string& input, std::string& output) +rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); - - auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(qna_sgs), - cudf::io::fst::detail::make_transition_table(qna_state_tt), - cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}), + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs), + fst::detail::make_transition_table(normalize_quotes::qna_state_tt), + fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}), stream); - auto d_input_scalar = cudf::make_string_scalar(input, stream_view); - auto& d_input = static_cast&>(*d_input_scalar); - - // Prepare input & output buffers - constexpr std::size_t single_item = 1; - cudf::detail::hostdevice_vector output_gpu(input.size() * 2, stream_view); - cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); - - // Allocate device-side temporary storage & run algorithm - parser.Transduce(d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), + rmm::device_uvector outbuf(inbuf.size() * 2, stream, mr); + rmm::device_scalar outbuf_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), + outbuf.data(), thrust::make_discard_iterator(), - output_gpu_size.device_ptr(), - start_state, - stream_view); - - // Async copy results from device to host - output_gpu.device_to_host_async(stream_view); - output_gpu_size.device_to_host_async(stream_view); - - // Make sure results have been copied back to host - stream.synchronize(); - - // Verify results - ASSERT_EQ(output_gpu_size[0], output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size()); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization1) -{ - std::string input = R"({"A":'TEST"'})"; - std::string output = R"({"A":"TEST\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization2) -{ - std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; - std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization3) -{ - std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; - std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization4) -{ - std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; - std::string output = - R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization5) -{ - std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; - std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization6) -{ - std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; - std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization7) -{ - std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - run_test(input, output); -} - -TEST_F(FstTest, 
GroundTruth_QuoteNormalization8) -{ - std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; - std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid1) -{ - std::string input = R"(["THIS IS A TEST'])"; - std::string output = R"(["THIS IS A TEST'])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid2) -{ - std::string input = R"(['THIS IS A TEST"])"; - std::string output = R"(["THIS IS A TEST\"])"; - run_test(input, output); -} + outbuf_size.data(), + normalize_quotes::start_state, + stream); -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid3) -{ - std::string input = R"({"MORE TEST'N":'RESUL})"; - std::string output = R"({"MORE TEST'N":"RESUL})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid4) -{ - std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; - std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid5) -{ - std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; - std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid6) -{ - std::string input = R"({'a':'\\''})"; - std::string output = R"({"a":"\\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid7) -{ - std::string input = R"(}'a': 'b'{)"; - std::string output = R"(}"a": "b"{)"; - run_test(input, output); + outbuf.resize(outbuf_size.value(stream), stream); + return outbuf; } -CUDF_TEST_PROGRAM_MAIN() +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 080da7800f4..2cfb5fa03c9 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -45,6 +46,15 @@ size_t sources_size(host_span> const sources, }); } +/** + * @brief Read from array of data sources into RMM buffer + * + * @param sources Array of data sources + * @param compression Compression format of source + * @param range_offset Number of bytes to skip from source start + * @param range_size Number of bytes to read from source + * @param stream CUDA stream used for device memory operations and kernel launches + */ rmm::device_uvector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, @@ -217,7 +227,14 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); + auto buffer = get_record_range_raw_input(sources, reader_opts, stream); + + // If input JSON buffer has single quotes and option to normalize single quotes is enabled, + // invoke pre-processing FST + if (reader_opts.is_enabled_normalize_single_quotes()) { + buffer = + normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource()); + } return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index db37e7abcdb..d05134fa837 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eee736613fe..24085eb5e10 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,13 +313,13 @@ ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 PERCENT 30 ) target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) -ConfigureTest(QUOTE_NORMALIZATION_TEST io/fst/quote_normalization_test.cu) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp new file mode 100644 index 00000000000..50faea5e4d8 --- /dev/null +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +// Base test fixture for tests +struct JsonNormalizationTest : public cudf::test::BaseFixture {}; + +void run_test(const std::string& host_input, const std::string& expected_host_output) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + rmm::device_uvector device_input( + host_input.size(), cudf::test::get_default_stream(), rsc.get()); + CUDF_CUDA_TRY(cudaMemcpyAsync(device_input.data(), + host_input.data(), + host_input.size(), + cudaMemcpyHostToDevice, + cudf::test::get_default_stream().value())); + // Preprocessing FST + auto device_fst_output = cudf::io::json::detail::normalize_single_quotes( + std::move(device_input), cudf::test::get_default_stream(), rsc.get()); + + std::string preprocessed_host_output(device_fst_output.size(), 0); + CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), + device_fst_output.data(), + preprocessed_host_output.size(), + cudaMemcpyDeviceToHost, + cudf::test::get_default_stream().value())); + CUDF_TEST_EXPECT_VECTOR_EQUAL( + preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1) +{ + std::string input = R"({"A":'TEST"'})"; + std::string output = R"({"A":"TEST\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2) +{ + std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; + std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3) +{ + std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; + std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) +{ + std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; + std::string output = + R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5) +{ + std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; + std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6) +{ + std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; + std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7) +{ + std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8) +{ + std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; + std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1) +{ + std::string input = R"(["THIS IS A TEST'])"; + std::string output = R"(["THIS IS A TEST'])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2) +{ + std::string input = R"(['THIS IS A TEST"])"; + std::string output = R"(["THIS IS A TEST\"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, 
GroundTruth_QuoteNormalization_Invalid3) +{ + std::string input = R"({"MORE TEST'N":'RESUL})"; + std::string output = R"({"MORE TEST'N":"RESUL})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4) +{ + std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; + std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5) +{ + std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; + std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6) +{ + std::string input = R"({'a':'\\''})"; + std::string output = R"({"a":"\\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7) +{ + std::string input = R"(}'a': 'b'{)"; + std::string output = R"(}"a": "b"{)"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, ReadJsonOption) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + // Test input + std::string const host_input = R"({"A":'TEST"'})"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .normalize_single_quotes(true); + + cudf::io::table_with_metadata processed_table = + cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); + + // Expected table + std::string const expected_input = R"({"A":"TEST\""})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true); + + cudf::io::table_with_metadata expected_table = + cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); +} + +TEST_F(JsonNormalizationTest, ErrorCheck) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + // Test input + std::string const host_input = R"({"A":'TEST"'})"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true); + + EXPECT_THROW(cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()), + cudf::logic_error); +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 523d594f8ba..35165c18c7a 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -30,6 +30,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; private final boolean recoverWithNull; + private final boolean normalizeSingleQuotes; private final boolean mixedTypesAsStrings; private JSONOptions(Builder builder) { @@ -37,6 +38,7 @@ private JSONOptions(Builder builder) { dayFirst = builder.dayFirst; lines = builder.lines; recoverWithNull = builder.recoverWithNull; + normalizeSingleQuotes = builder.normalizeSingleQuotes; mixedTypesAsStrings = builder.mixedTypesAsStrings; } @@ -53,6 +55,10 @@ public boolean isRecoverWithNull() { return recoverWithNull; } + public boolean isNormalizeSingleQuotes() { + return 
normalizeSingleQuotes; + } + public boolean isMixedTypesAsStrings() { return mixedTypesAsStrings; } @@ -71,6 +77,7 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), + opts.isNormalizeSingleQuotes(), opts.isMixedTypesAsStrings())); } @@ -1166,7 +1170,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), opts.isMixedTypesAsStrings()))) { + opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), + opts.isMixedTypesAsStrings()))) { return gatherJSONColumns(schema, twm); } } @@ -1182,7 +1187,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), opts.isMixedTypesAsStrings(), dsHandle))) { + opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), opts.isMixedTypesAsStrings(), dsHandle))) { return gatherJSONColumns(schema, twm); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1ac15a3023c..cef18b245e7 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1392,7 +1392,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean mixed_types_as_string) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1408,11 +1408,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( auto const recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .mixed_types_as_string(mixed_types_as_string); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)) + .mixed_types_as_string(mixed_types_as_string); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1470,8 +1472,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean mixed_types_as_string, - jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, + jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1503,11 +1505,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .mixed_types_as_string(mixed_types_as_string); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { @@ -1539,7 +1543,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean mixed_types_as_string) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) { bool read_buffer = true; if (buffer == 0) { @@ -1586,11 +1590,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .mixed_types_as_string(mixed_types_as_string); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 73002644858..f1c4d0803a3 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -87,6 +87,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); + private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json"); private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); @@ -330,6 +331,23 @@ void testReadJSONFile() { } @Test + void testReadSingleQuotesJSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("TEST\"", "TESTER'") + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + void testReadMixedType2JSONFileFeatureDisabled() { Schema schema = Schema.builder() .column(DType.STRING, "a") @@ -377,6 +395,21 @@ void testReadMixedType2JSONFile() throws IOException { } } + @Test + void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(false) + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, source)); + } + } + @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/js b/java/src/test/js new file mode 100644 index 00000000000..e69de29bb2d diff --git a/java/src/test/resources/single_quotes.json b/java/src/test/resources/single_quotes.json new file mode 100644 index 00000000000..cb432fbc643 --- /dev/null +++ b/java/src/test/resources/single_quotes.json @@ -0,0 +1,2 @@ +{"A":'TEST"'} +{'A':"TESTER'"} From 807318b5c0f10219291bf10db497018c7f42d591 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 24 Jan 2024 15:54:04 -0600 Subject: [PATCH 49/60] Update conda-cpp-post-build-checks to branch-24.04. 
(#14854) Fixes some merge issues with outdated versions from #14768. I also made a minor tweak to `update-version.sh` that double-quotes some outputs to make pre-commit happier. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/14854 --- .github/workflows/pr.yaml | 2 +- .github/workflows/test.yaml | 2 +- ci/release/update-version.sh | 8 ++++---- docs/dask_cudf/source/conf.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 41d4e42891c..14a74618413 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -47,7 +47,7 @@ jobs: conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 with: build_type: pull-request enable_check_symbols: true diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a1674e691cd..e044d69c6d8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 47e3f887d7d..02dba0d09e4 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -67,10 +67,10 @@ sed_runner 's/'"branch-.*\/cmake-format-rapids-cmake.json"'/'"branch-${NEXT_SHOR sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile # sphinx docs update -sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source/conf.py -sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py -sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/dask_cudf/source/conf.py -sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/dask_cudf/source/conf.py +sed_runner 's/version = .*/version = "'${NEXT_SHORT_TAG}'"/g' docs/cudf/source/conf.py +sed_runner 's/release = .*/release = "'${NEXT_FULL_TAG}'"/g' docs/cudf/source/conf.py +sed_runner 's/version = .*/version = "'${NEXT_SHORT_TAG}'"/g' docs/dask_cudf/source/conf.py +sed_runner 's/release = .*/release = "'${NEXT_FULL_TAG}'"/g' docs/dask_cudf/source/conf.py DEPENDENCIES=( cudf diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index f1f28ccd752..25f0eb41ed5 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # Configuration file for the Sphinx documentation builder. 
# @@ -11,8 +11,8 @@ project = "dask-cudf" copyright = "2018-2023, NVIDIA Corporation" author = "NVIDIA Corporation" -version = '24.04' -release = '24.04.00' +version = "24.04" +release = "24.04.00" language = "en" From 258d9ee28311df406c16b61e12bfc592d57149b0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 24 Jan 2024 15:13:25 -0800 Subject: [PATCH 50/60] Add row index and stripe size options to Python ORC chunked writer (#14785) Adds the APIs that control the stripe/row group size when using the chunked writer. This functions are already present in to_orc (non-chunked version of the same API). Adding this options to facilitate smaller unit tests. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14785 --- python/cudf/cudf/_lib/orc.pyx | 25 ++++++++++++++++++++----- python/cudf/cudf/tests/test_orc.py | 22 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index c64296eb7da..2cbdf76030b 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -375,13 +375,19 @@ cdef class ORCWriter: cdef object index cdef table_input_metadata tbl_meta cdef object cols_as_map_type + cdef object stripe_size_bytes + cdef object stripe_size_rows + cdef object row_index_stride def __cinit__(self, object path, object index=None, object compression="snappy", object statistics="ROWGROUP", - object cols_as_map_type=None): + object cols_as_map_type=None, + object stripe_size_bytes=None, + object stripe_size_rows=None, + object row_index_stride=None): self.sink = make_sink_info(path, self._data_sink) self.stat_freq = _get_orc_stat_freq(statistics) @@ -389,6 +395,9 @@ cdef class ORCWriter: self.index = index self.cols_as_map_type = cols_as_map_type \ if cols_as_map_type is None else set(cols_as_map_type) + self.stripe_size_bytes = stripe_size_bytes + self.stripe_size_rows = stripe_size_rows + self.row_index_stride = row_index_stride self.initialized = False def write_table(self, table): @@ -456,9 +465,7 @@ cdef class ORCWriter: pandas_metadata = generate_pandas_metadata(table, self.index) user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef chunked_orc_writer_options args - with nogil: - args = move( + cdef chunked_orc_writer_options c_opts = move( chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) @@ -466,7 +473,15 @@ cdef class ORCWriter: .enable_statistics(self.stat_freq) .build() ) - self.writer.reset(new orc_chunked_writer(args)) + if self.stripe_size_bytes is not None: + c_opts.set_stripe_size_bytes(self.stripe_size_bytes) + if self.stripe_size_rows is not None: + c_opts.set_stripe_size_rows(self.stripe_size_rows) + if self.row_index_stride is not None: + c_opts.set_row_index_stride(self.row_index_stride) + + with nogil: + self.writer.reset(new orc_chunked_writer(c_opts)) self.initialized = True diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 4630b6eef0a..6b7f86098a0 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1911,3 +1911,25 @@ def test_orc_reader_empty_deeply_nested_level(datadir): got = cudf.read_orc(path) assert_eq(expect, got) + + +def test_orc_chunked_writer_stripe_size(datadir): + from pyarrow import orc + + df = cudf.DataFrame({"col": gen_rand_series("int", 100000)}) + + buffer = BytesIO() + writer = 
ORCWriter(buffer, stripe_size_bytes=64 * 1024) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 10) + + buffer = BytesIO() + writer = ORCWriter(buffer, stripe_size_rows=20000) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 5)

From 5b1eef31ed4c5935285ef780dc74d35cea086b49 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Wed, 24 Jan 2024 18:40:15 -0600
Subject: [PATCH 51/60] Parquet sub-rowgroup reading. (#14360)

closes #14270

Implementation of sub-rowgroup reading of Parquet files. This PR implements an additional layer on top of the existing chunking system. Currently, the reader takes two parameters: `input_pass_read_limit`, which specifies a limit on temporary memory usage when reading and decompressing file data; and `output_pass_read_limit`, which specifies a limit on how large an output chunk (a table) can be.

Currently, when the user specifies a limit via `input_pass_read_limit`, the reader will perform multiple `passes` over the file at row-group granularity. That is, it will control how many row groups it will read at once to conform to the specified limit. However, there are cases where this is not sufficient. So this PR changes things so that we now have `subpasses` below the top-level `passes`. It works as follows:

- We read a set of input chunks based on the `input_pass_read_limit`, but we do not decompress them immediately. This constitutes a `pass`.
- Within each pass of compressed data, we progressively decompress batches of pages as `subpasses`.
- Within each `subpass` we apply the output limit to produce `chunks`.

So the overall structure of the reader is: (read) `pass` -> (decompress) `subpass` -> (decode) `chunk`

Major sections of code changes:

- Previously the incoming page data in the file was unsorted. To handle this, we later produced a `page_index` that could be applied to the array to get the pages in schema-sorted order. This was getting very unwieldy, so I just sort the pages up front now and the `page_index` array has gone away.
- There are now two sets of pages to be aware of in the code. Within each `pass_intermediate_data` there is the set of all pages within the current set of loaded row groups. And then within the `subpass_intermediate_data` struct there is a separate array of pages representing the current batch of decompressed data we are processing. To keep the confusion down, I changed a good amount of code to always reference its array through its associated struct, i.e. `pass.pages` or `subpass.pages`. In addition, I removed the `page_info` from `ColumnChunkDesc` to help prevent the kernels from getting confused. `ColumnChunkDesc` now only has a `dict_page` field, which is constant across all subpasses.
- The primary entry point for the chunking mechanism is in `handle_chunking`. Here we iterate through passes, subpasses and output chunks. Successive subpasses are computed and preprocessed through here.
- The volume of diffs you'll see in `reader_impl_chunking.cu` is a little deceptive. A lot of this is just functions (or pieces of functions) that have been moved over from either `reader_impl_preprocess.cu` or `reader_impl_helpers.cpp`. The most relevant actual changes are in: `handle_chunking`, `compute_input_passes`, `compute_next_subpass`, and `compute_chunks_for_subpass`.

Note on tests: I renamed `parquet_chunked_reader_tests.cpp` to `parquet_chunked_reader_test.cu` as I needed to use thrust.
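As a user-level illustration of the two limits described above, here is a minimal sketch (not part of this patch). It assumes the existing public `cudf::io::chunked_parquet_reader` class, where I understand the limits are passed to the constructor as an output-chunk byte limit followed by an input-pass byte limit; the file name and limit values are placeholders.

```cpp
// Minimal sketch, assuming the public cudf::io::chunked_parquet_reader interface.
// The file name and limit values are illustrative placeholders.
#include <cudf/io/parquet.hpp>
#include <cudf/table/table.hpp>

#include <cstddef>
#include <memory>
#include <vector>

int main()
{
  auto const opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{"example.parquet"}).build();

  std::size_t const chunk_read_limit = 512 * 1024 * 1024;   // cap on each returned table (output)
  std::size_t const pass_read_limit  = 1024 * 1024 * 1024;  // cap on compressed/temp memory (input)

  // The reader internally decides the pass -> subpass -> chunk boundaries from these limits.
  cudf::io::chunked_parquet_reader reader(chunk_read_limit, pass_read_limit, opts);

  std::vector<std::unique_ptr<cudf::table>> tables;
  while (reader.has_next()) {
    tables.push_back(reader.read_chunk().tbl);  // one output chunk per call
  }
  return 0;
}
```

Passing 0 for either limit disables it; with both limits at 0 the reader reads the whole file in a single pass and returns it as a single table, matching the non-chunked behavior.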
The only actual changes in the file are the addition of the `ParquetChunkedReaderInputLimitConstrainedTest` and `ParquetChunkedReaderInputLimitTest` test suites at the bottom. Authors: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14360 --- cpp/src/io/comp/nvcomp_adapter.hpp | 24 +- cpp/src/io/parquet/page_decode.cuh | 9 +- cpp/src/io/parquet/page_hdr.cu | 27 +- cpp/src/io/parquet/page_string_decode.cu | 10 +- cpp/src/io/parquet/parquet_gpu.hpp | 60 +- cpp/src/io/parquet/reader_impl.cpp | 206 ++- cpp/src/io/parquet/reader_impl.hpp | 128 +- cpp/src/io/parquet/reader_impl_chunking.cu | 1410 ++++++++++++++--- cpp/src/io/parquet/reader_impl_chunking.hpp | 89 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 1107 +++++++------ cpp/src/io/utilities/column_buffer.cpp | 31 + cpp/src/io/utilities/column_buffer.hpp | 12 +- cpp/tests/CMakeLists.txt | 2 +- ...est.cpp => parquet_chunked_reader_test.cu} | 342 +++- 14 files changed, 2387 insertions(+), 1070 deletions(-) rename cpp/tests/io/{parquet_chunked_reader_test.cpp => parquet_chunked_reader_test.cu} (73%) diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 1393b70f058..69a278757ce 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -99,8 +99,8 @@ inline bool operator==(feature_status_parameters const& lhs, feature_status_para * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers * @param[out] results List of output status structures - * @param[in] max_uncomp_chunk_size maximum size of uncompressed chunk - * @param[in] max_total_uncomp_size maximum total size of uncompressed data + * @param[in] max_uncomp_chunk_size Maximum size of any single uncompressed chunk + * @param[in] max_total_uncomp_size Maximum total size of uncompressed data * @param[in] stream CUDA stream to use */ void batched_decompress(compression_type compression, @@ -111,6 +111,24 @@ void batched_decompress(compression_type compression, size_t max_total_uncomp_size, rmm::cuda_stream_view stream); +/** + * @brief Return the amount of temporary space required in bytes for a given decompression + * operation. + * + * The size returned reflects the size of the scratch buffer to be passed to + * `batched_decompress_async` + * + * @param[in] compression Compression type + * @param[in] num_chunks The number of decompression chunks to be processed + * @param[in] max_uncomp_chunk_size Maximum size of any single uncompressed chunk + * @param[in] max_total_uncomp_size Maximum total size of uncompressed data + * @returns The total required size in bytes + */ +size_t batched_decompress_temp_size(compression_type compression, + size_t num_chunks, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size); + /** * @brief Gets the maximum size any chunk could compress to in the batch. 
* diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 8f256cd1f97..409b1464cd1 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1301,16 +1301,15 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { // String dictionary: use index s->dict_base = reinterpret_cast(s->col.str_dict_index); - s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); + s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair); } else { - s->dict_base = - s->col.page_info[0].page_data; // dictionary is always stored in the first page - s->dict_size = s->col.page_info[0].uncompressed_page_size; + s->dict_base = s->col.dict_page->page_data; + s->dict_size = s->col.dict_page->uncompressed_page_size; } s->dict_run = 0; s->dict_val = 0; s->dict_bits = (cur < end) ? *cur++ : 0; - if (s->dict_bits > 32 || !s->dict_base) { + if (s->dict_bits > 32 || (!s->dict_base && s->col.dict_page->num_input_values > 0)) { s->set_error_code(decode_error::INVALID_DICT_WIDTH); } break; diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 4be4f45497d..888d9452612 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -348,9 +348,11 @@ struct gpuParsePageHeader { * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -CUDF_KERNEL void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, - int32_t num_chunks, - kernel_error::pointer error_code) +CUDF_KERNEL +void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, + chunk_page_info* chunk_pages, + int32_t num_chunks, + kernel_error::pointer error_code) { using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; @@ -392,11 +394,10 @@ CUDF_KERNEL void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* ch bs->page.temp_string_buf = nullptr; bs->page.kernel_mask = decode_kernel_mask::NONE; } - num_values = bs->ck.num_values; - page_info = bs->ck.page_info; - num_dict_pages = bs->ck.num_dict_pages; - max_num_pages = (page_info) ? bs->ck.max_num_pages : 0; - values_found = 0; + num_values = bs->ck.num_values; + page_info = chunk_pages ? chunk_pages[chunk].pages : nullptr; + max_num_pages = page_info ? 
bs->ck.max_num_pages : 0; + values_found = 0; __syncwarp(); while (values_found < num_values && bs->cur < bs->end) { int index_out = -1; @@ -495,9 +496,9 @@ CUDF_KERNEL void __launch_bounds__(128) if (!lane_id && ck->num_dict_pages > 0 && ck->str_dict_index) { // Data type to describe a string string_index_pair* dict_index = ck->str_dict_index; - uint8_t const* dict = ck->page_info[0].page_data; - int dict_size = ck->page_info[0].uncompressed_page_size; - int num_entries = ck->page_info[0].num_input_values; + uint8_t const* dict = ck->dict_page->page_data; + int dict_size = ck->dict_page->uncompressed_page_size; + int num_entries = ck->dict_page->num_input_values; int pos = 0, cur = 0; for (int i = 0; i < num_entries; i++) { int len = 0; @@ -518,13 +519,15 @@ CUDF_KERNEL void __launch_bounds__(128) } void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, + chunk_page_info* chunk_pages, int32_t num_chunks, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, num_chunks, error_code); + gpuDecodePageHeaders<<>>( + chunks, chunk_pages, num_chunks, error_code); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 37a8cabc182..d652a43d097 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -868,14 +868,16 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSi if (col.str_dict_index) { // String dictionary: use index dict_base = reinterpret_cast(col.str_dict_index); - dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair); + dict_size = col.dict_page->num_input_values * sizeof(string_index_pair); } else { - dict_base = col.page_info[0].page_data; // dictionary is always stored in the first page - dict_size = col.page_info[0].uncompressed_page_size; + dict_base = col.dict_page->page_data; + dict_size = col.dict_page->uncompressed_page_size; } // FIXME: need to return an error condition...this won't actually do anything - if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } + if (s->dict_bits > 32 || (!dict_base && col.dict_page->num_input_values > 0)) { + CUDF_UNREACHABLE("invalid dictionary bit size"); + } str_bytes = totalDictEntriesSize( data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 18d282be855..d58c7f95389 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -339,6 +339,21 @@ struct PageInfo { decode_kernel_mask kernel_mask; }; +/** + * @brief Return the column schema id as the key for a PageInfo struct. + */ +struct get_page_key { + __device__ int32_t operator()(PageInfo const& page) const { return page.src_col_schema; } +}; + +/** + * @brief Return an iterator that returns they keys for a vector of pages. 
+ */ +inline auto make_page_key_iterator(device_span pages) +{ + return thrust::make_transform_iterator(pages.begin(), get_page_key{}); +} + /** * @brief Struct describing a particular chunk of column data */ @@ -362,7 +377,8 @@ struct ColumnChunkDesc { int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, - int32_t src_col_schema_) + int32_t src_col_schema_, + float list_bytes_per_row_est_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -375,7 +391,7 @@ struct ColumnChunkDesc { num_data_pages(0), num_dict_pages(0), max_num_pages(0), - page_info(nullptr), + dict_page(nullptr), str_dict_index(nullptr), valid_map_base{nullptr}, column_data_base{nullptr}, @@ -386,26 +402,25 @@ struct ColumnChunkDesc { decimal_precision(decimal_precision_), ts_clock_rate(ts_clock_rate_), src_col_index(src_col_index_), - src_col_schema(src_col_schema_) + src_col_schema(src_col_schema_), + list_bytes_per_row_est(list_bytes_per_row_est_) { } - uint8_t const* compressed_data{}; // pointer to compressed column chunk data - size_t compressed_size{}; // total compressed data size for this chunk - size_t num_values{}; // total number of values in this column - size_t start_row{}; // starting row of this chunk - uint32_t num_rows{}; // number of rows in this chunk + uint8_t const* compressed_data{}; // pointer to compressed column chunk data + size_t compressed_size{}; // total compressed data size for this chunk + size_t num_values{}; // total number of values in this column + size_t start_row{}; // file-wide, absolute starting row of this chunk + uint32_t num_rows{}; // number of rows in this chunk int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level int16_t max_nesting_depth{}; // max nesting depth of the output - uint16_t data_type{}; // basic column data type, ((type_length << 3) | - // parquet::Type) + uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages - int32_t max_num_pages{}; // size of page_info array - PageInfo* page_info{}; // output page info for up to num_dict_pages + - // num_data_pages (dictionary pages first) + level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages + int32_t max_num_pages{}; // size of page_info array + PageInfo const* dict_page{}; string_index_pair* str_dict_index{}; // index for string dictionary bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column void** column_data_base{}; // base pointers of column data @@ -418,6 +433,15 @@ struct ColumnChunkDesc { int32_t src_col_index{}; // my input column index int32_t src_col_schema{}; // my schema index in the file + + float list_bytes_per_row_est{}; // for LIST columns, an estimate on number of bytes per row +}; + +/** + * @brief A utility structure for use in decoding page headers. 
+ */ +struct chunk_page_info { + PageInfo* pages; }; /** @@ -578,11 +602,13 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk) * @brief Launches kernel for parsing the page headers in the column chunks * * @param[in] chunks List of column chunks + * @param[in] chunk_pages List of pages associated with the chunks, in chunk-sorted order * @param[in] num_chunks Number of column chunks * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ void DecodePageHeaders(ColumnChunkDesc* chunks, + chunk_page_info* chunk_pages, int32_t num_chunks, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c1082c0305a..24d46d91dbb 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,26 +29,28 @@ namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto& page_nesting = _pass_itm_data->page_nesting_info; - auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; - auto const level_type_size = _pass_itm_data->level_type_size; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + auto& page_nesting = subpass.page_nesting_info; + auto& page_nesting_decode = subpass.page_nesting_decode_info; + + auto const level_type_size = pass.level_type_size; // temporary space for DELTA_BYTE_ARRAY decoding. this only needs to live until // gpu::DecodeDeltaByteArray returns. rmm::device_uvector delta_temp_buf(0, _stream); // Should not reach here if there is no page data. - CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + CUDF_EXPECTS(subpass.pages.size() > 0, "There are no pages to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { + pass.chunks.begin(), pass.chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); // figure out which kernels to run - auto const kernel_mask = GetAggregatedDecodeKernelMask(pages, _stream); + auto const kernel_mask = GetAggregatedDecodeKernelMask(subpass.pages, _stream); // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. This size info will be used to pre-allocate memory for the column, @@ -59,8 +61,14 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - ComputePageStringSizes( - pages, chunks, delta_temp_buf, skip_rows, num_rows, level_type_size, kernel_mask, _stream); + ComputePageStringSizes(subpass.pages, + pass.chunks, + delta_temp_buf, + skip_rows, + num_rows, + level_type_size, + kernel_mask, + _stream); col_sizes = calculate_page_string_offsets(); @@ -83,26 +91,26 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) cudf::detail::hostdevice_vector(has_strings ? sum_max_depths : 0, _stream); // Update chunks with pointers to column data. 
- for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) { - input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, + for (size_t c = 0, chunk_off = 0; c < pass.chunks.size(); c++) { + input_column_info const& input_col = _input_columns[pass.chunks[c].src_col_index]; + CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema, "Column/page schema index mismatch"); - size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema); + size_t max_depth = _metadata->get_output_nesting_depth(pass.chunks[c].src_col_schema); chunk_offsets.push_back(chunk_off); // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers // to validity data - auto valids = chunk_nested_valids.host_ptr(chunk_off); - chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); + auto valids = chunk_nested_valids.host_ptr(chunk_off); + pass.chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to // out data - auto data = chunk_nested_data.host_ptr(chunk_off); - chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); + auto data = chunk_nested_data.host_ptr(chunk_off); + pass.chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); auto str_data = has_strings ? chunk_nested_str_data.host_ptr(chunk_off) : nullptr; - chunks[c].column_string_base = + pass.chunks[c].column_string_base = has_strings ? chunk_nested_str_data.device_ptr(chunk_off) : nullptr; chunk_off += max_depth; @@ -148,8 +156,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); // only do string buffer for leaf - if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); + if (out_buf.string_size() == 0 && col_sizes[pass.chunks[c].src_col_index] > 0) { + out_buf.create_string_data(col_sizes[pass.chunks[c].src_col_index], _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -159,12 +167,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) data[idx] = nullptr; } } - - // column_data_base will always point to leaf data, even for nested types. 
- page_count += chunks[c].max_num_pages; } - chunks.host_to_device_async(_stream); + pass.chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } @@ -179,44 +184,71 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // launch string decoder int s_idx = 0; if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { - DecodeStringPageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodeStringPageData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // launch delta byte array decoder if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { - DecodeDeltaByteArray( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodeDeltaByteArray(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // launch delta length byte array decoder if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_LENGTH_BA) != 0) { - DecodeDeltaLengthByteArray( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodeDeltaLengthByteArray(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // launch delta binary decoder if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BINARY) != 0) { - DecodeDeltaBinary( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodeDeltaBinary(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // launch the catch-all page decoder if (BitAnd(kernel_mask, decode_kernel_mask::GENERAL) != 0) { - DecodePageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodePageData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // synchronize the streams cudf::detail::join_streams(streams, _stream); - pages.device_to_host_async(_stream); + subpass.pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); if (error_code.value() != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str()); } + // error_code.value() is synchronous; explicitly sync here for better visibility + _stream.synchronize(); // for list columns, add the final offset to every offset buffer. // TODO : make this happen in more efficiently. 
Maybe use thrust::for_each @@ -259,10 +291,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) } // update null counts in the final column buffers - for (size_t idx = 0; idx < pages.size(); idx++) { - PageInfo* pi = &pages[idx]; + for (size_t idx = 0; idx < subpass.pages.size(); idx++) { + PageInfo* pi = &subpass.pages[idx]; if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } - ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + ColumnChunkDesc* col = &pass.chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; int index = pi->nesting_decode - page_nesting_decode.device_ptr(); @@ -344,60 +376,16 @@ void reader::impl::prepare_data(int64_t skip_rows, { // if we have not preprocessed at the whole-file level, do that now if (!_file_preprocessed) { - // if filter is not empty, then create output types as vector and pass for filtering. - std::vector output_types; - if (filter.has_value()) { - std::transform(_output_buffers.cbegin(), - _output_buffers.cend(), - std::back_inserter(output_types), - [](auto const& col) { return col.type; }); - } - std::tie( - _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = - _metadata->select_row_groups( - row_group_indices, skip_rows, num_rows, output_types, filter, _stream); - - if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && - not _input_columns.empty()) { - // fills in chunk information without physically loading or decompressing - // the associated data - create_global_chunk_info(); - - // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now - // we will read an entire row group at a time. However, it is possible to do - // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and - // changed the high level structure such that we weren't always reading an entire table's - // worth of columns at once. - compute_input_passes(); - } - - _file_preprocessed = true; + // setup file level information + // - read row group information + // - setup information on (parquet) chunks + // - compute schedule of input passes + preprocess_file(skip_rows, num_rows, row_group_indices, filter); } - // if we have to start a new pass, do that now - if (!_pass_preprocessed) { - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - - // always create the pass struct, even if we end up with no passes. - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && - not _input_columns.empty() && _current_input_pass < num_passes) { - // setup the pass_intermediate_info for this pass. 
- setup_next_pass(); - - load_and_decompress_data(); - preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); - - if (_output_chunk_read_limit == 0) { // read the whole file at once - CUDF_EXPECTS(_pass_itm_data->output_chunk_read_info.size() == 1, - "Reading the whole file should yield only one chunk."); - } - } - - _pass_preprocessed = true; - } + // handle any chunking work (ratcheting through the subpasses and chunks within + // our current pass) + if (_file_itm_data.num_passes() > 0) { handle_chunking(uses_custom_row_bounds); } } void reader::impl::populate_metadata(table_metadata& out_metadata) @@ -427,12 +415,12 @@ table_with_metadata reader::impl::read_chunk_internal( auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); - if (!has_next() || _pass_itm_data->output_chunk_read_info.empty()) { - return finalize_output(out_metadata, out_columns, filter); - } + // no work to do (this can happen on the first pass if we have no rows to read) + if (!has_more_work()) { return finalize_output(out_metadata, out_columns, filter); } - auto const& read_info = - _pass_itm_data->output_chunk_read_info[_pass_itm_data->current_output_chunk]; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + auto const& read_info = subpass.output_chunk_read_info[subpass.current_output_chunk]; // Allocate memory buffers for the output columns. allocate_columns(read_info.skip_rows, read_info.num_rows, uses_custom_row_bounds); @@ -485,15 +473,12 @@ table_with_metadata reader::impl::finalize_output( _output_metadata = std::make_unique(out_metadata); } - // advance chunks/passes as necessary - _pass_itm_data->current_output_chunk++; - _chunk_count++; - if (_pass_itm_data->current_output_chunk >= _pass_itm_data->output_chunk_read_info.size()) { - _pass_itm_data->current_output_chunk = 0; - _pass_itm_data->output_chunk_read_info.clear(); - - _current_input_pass++; - _pass_preprocessed = false; + // advance output chunk/subpass/pass info + if (_file_itm_data.num_passes() > 0) { + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + subpass.current_output_chunk++; + _file_itm_data._output_chunk_count++; } if (filter.has_value()) { @@ -530,7 +515,7 @@ table_with_metadata reader::impl::read_chunk() { // Reset the output buffers to their original states (right after reader construction). // Don't need to do it if we read the file all at once. - if (_chunk_count > 0) { + if (_file_itm_data._output_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); @@ -553,10 +538,9 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - size_t const num_input_passes = std::max( - int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1); - return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || - (_current_input_pass < num_input_passes); + // current_input_pass will only be incremented to be == num_passes after + // the last chunk in the last subpass in the last pass has been returned + return has_more_work(); } namespace { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index cea4ba35606..67c56c9c2d7 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -120,6 +120,8 @@ class reader::impl { */ table_with_metadata read_chunk(); + // top level functions involved with ratcheting through the passes, subpasses + // and output chunks of the read process private: /** * @brief Perform the necessary data preprocessing for parsing file later on. @@ -138,20 +140,101 @@ class reader::impl { std::optional> filter); /** - * @brief Create chunk information and start file reads + * @brief Preprocess step for the entire file. + * + * Only ever called once. This function reads in rowgroup and associated chunk + * information and computes the schedule of top level passes (see `pass_intermediate_data`). + * + * @param skip_rows The number of rows to skip in the requested set of rowgroups to be read + * @param num_rows The total number of rows to read out of the selected rowgroups + * @param row_group_indices Lists of row groups to read, one per source + * @param filter Optional AST expression to filter output rows + */ + void preprocess_file(int64_t skip_rows, + std::optional const& num_rows, + host_span const> row_group_indices, + std::optional> filter); + + /** + * @brief Ratchet the pass/subpass/chunk process forward. + * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specified + * bounds + */ + void handle_chunking(bool uses_custom_row_bounds); + + /** + * @brief Setup step for the next input read pass. + * + * A 'pass' is defined as a subset of row groups read out of the globally + * requested set of all row groups. + * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + */ + void setup_next_pass(bool uses_custom_row_bounds); + + /** + * @brief Setup step for the next decompression subpass. + * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * + * A 'subpass' is defined as a subset of pages within a pass that are + * decompressed and decoded as a batch. Subpasses may be further subdivided + * into output chunks. + */ + void setup_next_subpass(bool uses_custom_row_bounds); + + /** + * @brief Read a chunk of data and return an output table. + * + * This function is called internally and expects all preprocessing steps have already been done. + * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * @param filter Optional AST expression to filter output rows + * @return The output table along with columns' metadata + */ + table_with_metadata read_chunk_internal( + bool uses_custom_row_bounds, + std::optional> filter); + + // utility functions + private: + /** + * @brief Read the set of column chunks to be processed for this pass. + * + * Does not decompress the chunk data. * * @return pair of boolean indicating if compressed chunks were found and a vector of futures for * read completion */ - std::pair>> read_and_decompress_column_chunks(); + std::pair>> read_column_chunks(); /** - * @brief Load and decompress the input file(s) into memory. + * @brief Read compressed data and page information for the current pass. */ - void load_and_decompress_data(); + void read_compressed_data(); /** - * @brief Perform some preprocessing for page data and also compute the split locations + * @brief Build string dictionary indices for a pass. 
+ * + */ + void build_string_dict_indices(); + + /** + * @brief For list columns, generate estimated row counts for pages in the current pass. + * + * The row counts in the pages that come out of the file only reflect the number of values in + * all of the rows in the page, not the number of rows themselves. In order to do subpass reading + * more accurately, we would like to have a more accurate guess of the real number of rows per + * page. + */ + void generate_list_column_row_count_estimates(); + + /** + * @brief Perform some preprocessing for subpass page data and also compute the split locations * {skip_rows, num_rows} for chunked reading. * * There are several pieces of information we can't compute directly from row counts in @@ -166,7 +249,7 @@ class reader::impl { * @param chunk_read_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit */ - void preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit); + void preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit); /** * @brief Allocate nesting information storage for all pages and set pointers to it. @@ -194,20 +277,6 @@ class reader::impl { */ void populate_metadata(table_metadata& out_metadata); - /** - * @brief Read a chunk of data and return an output table. - * - * This function is called internally and expects all preprocessing steps have already been done. - * - * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific - * bounds - * @param filter Optional AST expression to filter output rows - * @return The output table along with columns' metadata - */ - table_with_metadata read_chunk_internal( - bool uses_custom_row_bounds, - std::optional> filter); - /** * @brief Finalize the output table by adding empty columns for the non-selected columns in * schema. @@ -260,17 +329,18 @@ class reader::impl { */ void compute_input_passes(); - /** - * @brief Close out the existing pass (if any) and prepare for the next pass. - */ - void setup_next_pass(); - /** * @brief Given a set of pages that have had their sizes computed by nesting level and * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. */ - void compute_splits_for_pass(); + void compute_output_chunks_for_subpass(); + + [[nodiscard]] bool has_more_work() const + { + return _file_itm_data.num_passes() > 0 && + _file_itm_data._current_input_pass < _file_itm_data.num_passes(); + } private: rmm::cuda_stream_view _stream; @@ -311,13 +381,9 @@ class reader::impl { bool _file_preprocessed{false}; std::unique_ptr _pass_itm_data; - bool _pass_preprocessed{false}; std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes - - std::size_t _current_input_pass{0}; // current input pass index - std::size_t _chunk_count{0}; // how many output chunks we have produced }; } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 213fc380a34..1bfe5745b9e 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,11 @@ #include #include +#include +#include + +#include #include #include @@ -27,37 +31,61 @@ #include #include #include +#include #include +#include + +#include + +#include namespace cudf::io::parquet::detail { namespace { -struct cumulative_row_info { - size_t row_count; // cumulative row count +struct split_info { + row_range rows; + int64_t split_pos; +}; + +struct cumulative_page_info { + size_t row_index; // row index size_t size_bytes; // cumulative size in bytes int key; // schema index }; +// the minimum amount of memory we can safely expect to be enough to +// do a subpass decode. if the difference between the user specified limit and +// the actual memory used for compressed/temp data is > than this value, we will still use +// at least this many additional bytes. +// Example: +// - user has specified 1 GB limit +// - we have read in 900 MB of compressed data +// - that leaves us 100 MB of space for decompression batches +// - to keep the gpu busy, we really don't want to do less than 200 MB at a time so we're just going +// to use 200 MB of space +// even if that goes past the user-specified limit. +constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; + +// percentage of the total available input read limit that should be reserved for compressed +// data vs uncompressed data. +constexpr float input_limit_compression_reserve = 0.3f; + #if defined(CHUNKING_DEBUG) -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, +void print_cumulative_page_info(device_span d_pages, + device_span d_chunks, + device_span d_c_info, rmm::cuda_stream_view stream) { - pages.device_to_host_sync(stream); + std::vector pages = cudf::detail::make_std_vector_sync(d_pages, stream); + std::vector chunks = cudf::detail::make_std_vector_sync(d_chunks, stream); + std::vector c_info = cudf::detail::make_std_vector_sync(d_c_info, stream); printf("------------\nCumulative sizes by page\n"); std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + 0, [&](size_type i) { return pages[i].src_col_schema; }); thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); schemas.resize(last - schemas.begin()); @@ -66,38 +94,44 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages for (size_t idx = 0; idx < schemas.size(); idx++) { printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; + auto const& page = pages[pidx]; if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + bool const is_list = chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0; + printf("\tP %s: {%lu, %lu, %lu}\n", + is_list ? 
"(L)" : "", + pidx, + c_info[pidx].row_index, + c_info[pidx].size_bytes); } } } -void print_cumulative_row_info(host_span sizes, +void print_cumulative_row_info(host_span sizes, std::string const& label, - std::optional> splits = std::nullopt) + std::optional> splits = std::nullopt) { if (splits.has_value()) { - printf("------------\nSplits\n"); + printf("------------\nSplits (skip_rows, num_rows)\n"); for (size_t idx = 0; idx < splits->size(); idx++) { printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); } } - printf("------------\nCumulative sizes %s\n", label.c_str()); + printf("------------\nCumulative sizes %s (index, row_index, size_bytes, page_key)\n", + label.c_str()); for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + printf("{%lu, %lu, %lu, %d}", idx, sizes[idx].row_index, sizes[idx].size_bytes, sizes[idx].key); if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto start = thrust::make_transform_iterator(splits->begin(), + [](row_range const& i) { return i.skip_rows; }); auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); + auto split = std::find(start, end, sizes[idx].row_index); auto const split_index = [&]() -> int { if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_index > sizes[idx].row_index))) { return static_cast(std::distance(start, split)); } return idx == 0 ? 0 : -1; @@ -114,13 +148,13 @@ void print_cumulative_row_info(host_span sizes, #endif // CHUNKING_DEBUG /** - * @brief Functor which reduces two cumulative_row_info structs of the same key. + * @brief Functor which reduces two cumulative_page_info structs of the same key. */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const +struct cumulative_page_sum { + cumulative_page_info operator() + __device__(cumulative_page_info const& a, cumulative_page_info const& b) const { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + return cumulative_page_info{0, a.size_bytes + b.size_bytes, a.key}; } }; @@ -178,32 +212,57 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * * Sums across all nesting levels. 
*/ -struct get_cumulative_row_info { - PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) +struct get_page_output_size { + __device__ cumulative_page_info operator()(PageInfo const& page) const { - auto const& page = pages[index]; if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; + return cumulative_page_info{0, 0, page.src_col_schema}; } // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([page] __device__(size_type i) { auto const& pni = page.nesting[i]; return cudf::type_dispatcher( data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); + })); return { - row_count, + 0, thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, page.src_col_schema}; } }; +/** + * @brief Functor which sets the (uncompressed) size of a page. + */ +struct get_page_input_size { + __device__ cumulative_page_info operator()(PageInfo const& page) const + { + // we treat dictionary page sizes as 0 for subpasses because we have already paid the price for + // them at the pass level. + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return {0, 0, page.src_col_schema}; } + return {0, static_cast(page.uncompressed_page_size), page.src_col_schema}; + } +}; + +/** + * @brief Functor which sets the absolute row index of a page in a cumulative_page_info struct + */ +struct set_row_index { + device_span chunks; + device_span pages; + device_span c_info; + + __device__ void operator()(size_t i) + { + auto const& page = pages[i]; + auto const& chunk = chunks[page.chunk_idx]; + size_t const page_start_row = chunk.start_row + page.chunk_row + page.num_rows; + c_info[i].row_index = page_start_row; + } +}; + /** * @brief Functor which computes the effective size of all input columns by page. * @@ -219,12 +278,12 @@ struct get_cumulative_row_info { * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that * page. Essentially, a conservative over-estimate of the real size. */ -struct row_total_size { - cumulative_row_info const* c_info; +struct page_total_size { + cumulative_page_info const* c_info; size_type const* key_offsets; size_t num_keys; - __device__ cumulative_row_info operator()(cumulative_row_info const& i) + __device__ cumulative_page_info operator()(cumulative_page_info const& i) const { // sum sizes for each input column at this row size_t sum = 0; @@ -232,71 +291,81 @@ struct row_total_size { auto const start = key_offsets[idx]; auto const end = key_offsets[idx + 1]; auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + 0, cuda::proclaim_return_type([&] __device__(size_type i) { + return c_info[i].row_index; + })); auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_index) - iter; sum += c_info[page_index].size_bytes; } - return {i.row_count, sum, i.key}; + return {i.row_index, sum, i.key}; } }; /** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. 
+ * @brief Functor which returns the compressed data size for a chunk + */ +struct get_chunk_compressed_size { + __device__ size_t operator()(ColumnChunkDesc const& chunk) const { return chunk.compressed_size; } +}; + +/** + * @brief Find the first entry in the aggreggated_info that corresponds to the specified row * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +size_t find_start_index(cudf::host_span aggregated_info, + size_t start_row) { - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. - if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } + auto start = thrust::make_transform_iterator( + aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_index; }); + auto start_index = + thrust::lower_bound(thrust::host, start, start + aggregated_info.size(), start_row) - start; + + // cumulative_page_info.row_index is the -end- of the rows of a given page. so move forward until + // we find the next group of pages + while (start_index < (static_cast(aggregated_info.size()) - 1) && + (start_index < 0 || aggregated_info[start_index].row_index == start_row)) { + start_index++; + } - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. 
- while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } + return start_index; +} - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } +/** + * @brief Given a current position and row index, find the next split based on the + * specified size limit + * + * @returns The inclusive index within `sizes` where the next split should happen + * + */ +int64_t find_next_split(int64_t cur_pos, + size_t cur_row_index, + size_t cur_cumulative_size, + cudf::host_span sizes, + size_t size_limit) +{ + auto const start = thrust::make_transform_iterator( + sizes.begin(), + [&](cumulative_page_info const& i) { return i.size_bytes - cur_cumulative_size; }); + auto const end = start + sizes.size(); + + int64_t split_pos = thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > size_limit)) { + split_pos--; } - // print_cumulative_row_info(sizes, "adjusted", splits); - return splits; + // cumulative_page_info.row_index is the -end- of the rows of a given page. so move forward until + // we find the next group of pages + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_index == cur_row_index)) { + split_pos++; + } + + return split_pos; } /** @@ -340,15 +409,969 @@ template return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); } -struct row_count_compare { - __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) +struct row_count_less { + __device__ bool operator()(cumulative_page_info const& a, cumulative_page_info const& b) const + { + return a.row_index < b.row_index; + } +}; + +/** + * @brief return compressed and total size of the data in a row group + * + */ +std::pair get_row_group_size(RowGroup const& rg) +{ + auto compressed_size_iter = thrust::make_transform_iterator( + rg.columns.begin(), [](ColumnChunk const& c) { return c.meta_data.total_compressed_size; }); + + // the trick is that total temp space needed is tricky to know + auto const compressed_size = + std::reduce(compressed_size_iter, compressed_size_iter + rg.columns.size()); + auto const total_size = compressed_size + rg.total_byte_size; + return {compressed_size, total_size}; +} + +/** + * @brief For a set of cumulative_page_info data, adjust the size_bytes field + * such that it reflects the worst case for all pages that span the same rows. + * + * By doing this, we can now look at row X and know the total + * byte cost for all pages that span row X, not just the cost up to row X itself. + * + * This function is asynchronous. Call stream.synchronize() before using the + * results. + */ +std::pair, rmm::device_uvector> +adjust_cumulative_sizes(device_span c_info, + device_span pages, + rmm::cuda_stream_view stream) +{ + // sort by row count + rmm::device_uvector c_info_sorted = + make_device_uvector_async(c_info, stream, rmm::mr::get_current_device_resource()); + thrust::sort( + rmm::exec_policy_nosync(stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_less{}); + + // page keys grouped by split. 
+ rmm::device_uvector page_keys_by_split{c_info.size(), stream}; + thrust::transform(rmm::exec_policy_nosync(stream), + c_info_sorted.begin(), + c_info_sorted.end(), + page_keys_by_split.begin(), + cuda::proclaim_return_type( + [] __device__(cumulative_page_info const& c) { return c.key; })); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(pages.size() + 1, stream); + auto page_keys = make_page_key_iterator(pages); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys, + page_keys + pages.size(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy_nosync(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. + // + rmm::device_uvector aggregated_info(c_info.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + page_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + return {std::move(aggregated_info), std::move(page_keys_by_split)}; +} + +struct page_span { + size_t start, end; +}; + +struct get_page_row_index { + device_span c_info; + + __device__ size_t operator()(size_t i) const { return c_info[i].row_index; } +}; + +/** + * @brief Return the span of page indices for a given column index that spans start_row and end_row + * + */ +template +struct get_page_span { + device_span page_offsets; + RowIndexIter page_row_index; + size_t const start_row; + size_t const end_row; + + get_page_span(device_span _page_offsets, + RowIndexIter _page_row_index, + size_t _start_row, + size_t _end_row) + : page_offsets(_page_offsets), + page_row_index(_page_row_index), + start_row(_start_row), + end_row(_end_row) + { + } + + __device__ page_span operator()(size_t column_index) const + { + auto const first_page_index = page_offsets[column_index]; + auto const column_page_start = page_row_index + first_page_index; + auto const column_page_end = page_row_index + page_offsets[column_index + 1]; + auto const num_pages = column_page_end - column_page_start; + + auto start_page = + (thrust::lower_bound(thrust::seq, column_page_start, column_page_end, start_row) - + column_page_start) + + first_page_index; + if (page_row_index[start_page] == start_row) { start_page++; } + + auto end_page = (thrust::lower_bound(thrust::seq, column_page_start, column_page_end, end_row) - + column_page_start) + + first_page_index; + if (end_page < (first_page_index + num_pages)) { end_page++; } + + return {static_cast(start_page), static_cast(end_page)}; + } +}; + +struct get_span_size { + __device__ size_t operator()(page_span const& s) const { return s.end - s.start; } +}; + +/** + * @brief Computes the next subpass within the current pass. 
+ *
+ * A subpass is a subset of the pages within the parent pass that is decompressed
+ * as a batch and decoded. Subpasses are the level at which we control intermediate
+ * memory usage. A pass consists of >= 1 subpass. We cannot compute all subpasses in one
+ * shot because we do not know how many rows we actually have in the pages of list columns.
+ * So we have to make an educated guess that fits within the memory limits, and then adjust
+ * for subsequent subpasses when we see how many rows we actually receive.
+ *
+ * @param c_info The cumulative page size information (row count and byte size) per column
+ * @param pages All of the pages in the pass
+ * @param page_offsets Offsets into the pages array representing the first page for each column
+ * @param start_row The row to start the subpass at
+ * @param size_limit The size limit in bytes of the subpass
+ * @param num_columns The number of columns
+ * @param stream The stream to execute CUDA operations on
+ * @returns A tuple containing a vector of page_span structs indicating the page indices to include
+ * for each column to be processed, the total number of pages over all columns, and the total
+ * expected memory usage (including scratch space)
+ *
+ */
+std::tuple<std::vector<page_span>, size_t, size_t> compute_next_subpass(
+  device_span<cumulative_page_info const> c_info,
+  device_span<PageInfo const> pages,
+  device_span<size_type const> page_offsets,
+  size_t start_row,
+  size_t size_limit,
+  size_t num_columns,
+  rmm::cuda_stream_view stream)
+{
+  auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream);
+
+  // bring back to the cpu
+  auto const h_aggregated_info = cudf::detail::make_std_vector_sync(aggregated_info, stream);
+  // print_cumulative_row_info(h_aggregated_info, "adjusted");
+
+  // TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent
+  // about skipping subpasses/pages that do not fall within the range of values, but only if the
+  // data does not contain lists (because our row counts are only estimates in that case)
+
+  // find the next split
+  auto const start_index = find_start_index(h_aggregated_info, start_row);
+  auto const cumulative_size =
+    start_row == 0 || start_index == 0 ? 
0 : h_aggregated_info[start_index - 1].size_bytes; + auto const end_index = + find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit); + auto const end_row = h_aggregated_info[end_index].row_index; + + // for each column, collect the set of pages that spans start_row / end_row + rmm::device_uvector page_bounds(num_columns, stream); + auto iter = thrust::make_counting_iterator(size_t{0}); + auto page_row_index = + cudf::detail::make_counting_transform_iterator(0, get_page_row_index{c_info}); + thrust::transform(rmm::exec_policy_nosync(stream), + iter, + iter + num_columns, + page_bounds.begin(), + get_page_span{page_offsets, page_row_index, start_row, end_row}); + + // total page count over all columns + auto page_count_iter = thrust::make_transform_iterator(page_bounds.begin(), get_span_size{}); + size_t const total_pages = + thrust::reduce(rmm::exec_policy(stream), page_count_iter, page_count_iter + num_columns); + + return {cudf::detail::make_std_vector_sync(page_bounds, stream), + total_pages, + h_aggregated_info[end_index].size_bytes - cumulative_size}; +} + +std::vector compute_page_splits_by_row(device_span c_info, + device_span pages, + size_t skip_rows, + size_t num_rows, + size_t size_limit, + rmm::cuda_stream_view stream) +{ + auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); + + // bring back to the cpu + std::vector h_aggregated_info = + cudf::detail::make_std_vector_sync(aggregated_info, stream); + // print_cumulative_row_info(h_aggregated_info, "adjusted"); + + std::vector splits; + // note: we are working with absolute row indices so skip_rows represents the absolute min row + // index we care about + size_t cur_pos = find_start_index(h_aggregated_info, skip_rows); + size_t cur_row_index = skip_rows; + size_t cur_cumulative_size = 0; + auto const max_row = min(skip_rows + num_rows, h_aggregated_info.back().row_index); + while (cur_row_index < max_row) { + auto const split_pos = + find_next_split(cur_pos, cur_row_index, cur_cumulative_size, h_aggregated_info, size_limit); + + auto const start_row = cur_row_index; + cur_row_index = min(max_row, h_aggregated_info[split_pos].row_index); + splits.push_back({start_row, cur_row_index - start_row}); + cur_pos = split_pos; + cur_cumulative_size = h_aggregated_info[split_pos].size_bytes; + } + // print_cumulative_row_info(h_aggregated_info, "adjusted w/splits", splits); + + return splits; +} + +/** + * @brief Decompresses a set of pages contained in the set of chunks. + * + * This function handles the case where `pages` is only a subset of all available + * pages in `chunks`. + * + * @param chunks List of column chunk descriptors + * @param pages List of page information + * @param dict_pages If true, decompress dictionary pages only. Otherwise decompress non-dictionary + * pages only. 
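Illustrative aside, not part of the patch: a host-only sketch of the "worst case" accounting that adjust_cumulative_sizes and its page_total_size step implement, using the Column A / Column B picture from the comment in adjust_cumulative_sizes above. The page sizes are invented; the point is that a split at row 100 must be charged for the whole page of column B that spans it.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct page_end {
  std::size_t row_index;   // end row of the page
  std::size_t size_bytes;  // cumulative bytes for this column up to and including the page
};

// Charge each column the cumulative size of the first page whose end row reaches the boundary,
// i.e. the page that spans it.
std::size_t worst_case_bytes(std::vector<std::vector<page_end>> const& columns, std::size_t row)
{
  std::size_t total = 0;
  for (auto const& col : columns) {
    auto const it = std::lower_bound(
      col.begin(), col.end(), row, [](page_end const& p, std::size_t r) { return p.row_index < r; });
    total += (it == col.end()) ? col.back().size_bytes : it->size_bytes;
  }
  return total;
}

int main()
{
  // Column A: pages ending at rows 100 and 200.  Column B: a single page ending at row 200.
  std::vector<std::vector<page_end>> const columns{{{100, 1000}, {200, 2000}}, {{200, 6000}}};
  std::printf("bytes charged at row 100: %zu\n", worst_case_bytes(columns, 100));  // 1000 + 6000
  std::printf("bytes charged at row 200: %zu\n", worst_case_bytes(columns, 200));  // 2000 + 6000
  return 0;
}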
+ * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Device buffer to decompressed page data + */ +[[nodiscard]] rmm::device_buffer decompress_page_data( + cudf::detail::hostdevice_vector const& chunks, + cudf::detail::hostdevice_vector& pages, + bool dict_pages, + rmm::cuda_stream_view stream) +{ + auto for_each_codec_page = [&](Compression codec, std::function const& f) { + for (size_t p = 0; p < pages.size(); p++) { + if (chunks[pages[p].chunk_idx].codec == codec && + ((dict_pages && (pages[p].flags & PAGEINFO_FLAGS_DICTIONARY)) || + (!dict_pages && !(pages[p].flags & PAGEINFO_FLAGS_DICTIONARY)))) { + f(p); + } + } + }; + + // Brotli scratch memory for decompressing + rmm::device_buffer debrotli_scratch; + + // Count the exact number of compressed pages + size_t num_comp_pages = 0; + size_t total_decomp_size = 0; + + struct codec_stats { + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; + }; + + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; + + auto is_codec_supported = [&codecs](int8_t codec) { + if (codec == UNCOMPRESSED) return true; + return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { + return codec == cstats.compression_type; + }) != codecs.end(); + }; + CUDF_EXPECTS(std::all_of(chunks.begin(), + chunks.end(), + [&is_codec_supported](auto const& chunk) { + return is_codec_supported(chunk.codec); + }), + "Unsupported compression type"); + + for (auto& codec : codecs) { + for_each_codec_page(codec.compression_type, [&](size_t page) { + auto page_uncomp_size = pages[page].uncompressed_page_size; + total_decomp_size += page_uncomp_size; + codec.total_decomp_size += page_uncomp_size; + codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); + codec.num_pages++; + num_comp_pages++; + }); + if (codec.compression_type == BROTLI && codec.num_pages > 0) { + debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + } + } + + // Dispatch batches of pages to decompress for each codec. + // Buffer needs to be padded, required by `gpuDecodePageData`. + rmm::device_buffer decomp_pages( + cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); + + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + comp_out.reserve(num_comp_pages); + + // vectors to save v2 def and rep level data, if any + std::vector> copy_in; + copy_in.reserve(num_comp_pages); + std::vector> copy_out; + copy_out.reserve(num_comp_pages); + + rmm::device_uvector comp_res(num_comp_pages, stream); + thrust::fill(rmm::exec_policy_nosync(stream), + comp_res.begin(), + comp_res.end(), + compression_result{0, compression_status::FAILURE}); + + size_t decomp_offset = 0; + int32_t start_pos = 0; + for (auto const& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + for_each_codec_page(codec.compression_type, [&](size_t page_idx) { + auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; + auto& page = pages[page_idx]; + // offset will only be non-zero for V2 pages + auto const offset = + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; + // for V2 need to copy def and rep level info into place, and then offset the + // input and output buffers. otherwise we'd have to keep both the compressed + // and decompressed data. 
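Illustrative aside, not part of the patch: the arithmetic behind the V2 page handling described in the comment above, with a simplified page record. Only the definition/repetition level bytes at the front of a V2 page are copied verbatim; the remainder is what actually goes through the codec.

#include <cstddef>
#include <cstdio>

struct v2_page {
  std::size_t compressed_size;    // page bytes as stored in the file
  std::size_t uncompressed_size;  // size of the fully decoded page
  std::size_t def_level_bytes;    // V2: definition levels, stored uncompressed up front
  std::size_t rep_level_bytes;    // V2: repetition levels, stored uncompressed up front
};

int main()
{
  v2_page const page{700, 1200, 40, 60};

  std::size_t const offset = page.def_level_bytes + page.rep_level_bytes;
  // level data: straight copy into the output buffer
  std::printf("copy bytes [0, %zu) verbatim\n", offset);
  // remainder: a decompression job whose input and output are both shifted by the offset
  std::printf("decompress %zu compressed bytes into %zu output bytes at offset %zu\n",
              page.compressed_size - offset,
              page.uncompressed_size - offset,
              offset);
  return 0;
}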
+ if (offset != 0) { + copy_in.emplace_back(page.page_data, offset); + copy_out.emplace_back(dst_base, offset); + } + comp_in.emplace_back(page.page_data + offset, + static_cast(page.compressed_page_size - offset)); + comp_out.emplace_back(dst_base + offset, + static_cast(page.uncompressed_page_size - offset)); + page.page_data = dst_base; + decomp_offset += page.uncompressed_page_size; + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async( + comp_in_view, stream, rmm::mr::get_current_device_resource()); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async( + comp_out_view, stream, rmm::mr::get_current_device_resource()); + device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); + + switch (codec.compression_type) { + case GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + break; + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + } + break; + case ZSTD: + nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + break; + case BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_res_view, + debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; + } + start_pos += codec.num_pages; + } + + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + cuda::proclaim_return_type([] __device__(auto const& res) { + return res.status == compression_status::SUCCESS; + })), + "Error during decompression"); + + // now copy the uncompressed V2 def and rep level data + if (not copy_in.empty()) { + auto const d_copy_in = cudf::detail::make_device_uvector_async( + copy_in, stream, rmm::mr::get_current_device_resource()); + auto const d_copy_out = cudf::detail::make_device_uvector_async( + copy_out, stream, rmm::mr::get_current_device_resource()); + + gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); + stream.synchronize(); + } + + pages.host_to_device_async(stream); + + stream.synchronize(); + return decomp_pages; +} + +struct flat_column_num_rows { + ColumnChunkDesc const* chunks; + + __device__ size_type operator()(PageInfo const& page) const + { + // ignore dictionary pages and pages belonging to any column containing repetition (lists) + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { + return 0; + } + return page.num_rows; + } +}; + +struct row_counts_nonzero { + __device__ bool operator()(size_type count) const { return count > 0; } +}; + +struct row_counts_different { + size_type const expected; + __device__ bool operator()(size_type count) const { return (count != 0) && (count != expected); } +}; + +/** + * @brief Detect malformed parquet input data. + * + * We have seen cases where parquet files can be oddly malformed. 
This function specifically + * detects one case in particular: + * + * - When you have a file containing N rows + * - For some reason, the sum total of the number of rows over all pages for a given column + * is != N + * + * @param pages All pages to be decoded + * @param chunks Chunk data + * @param expected_row_count Expected row count, if applicable + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void detect_malformed_pages(device_span pages, + device_span chunks, + std::optional expected_row_count, + rmm::cuda_stream_view stream) +{ + // sum row counts for all non-dictionary, non-list columns. other columns will be indicated as 0 + rmm::device_uvector row_counts(pages.size(), + stream); // worst case: num keys == num pages + auto const size_iter = + thrust::make_transform_iterator(pages.begin(), flat_column_num_rows{chunks.data()}); + auto const row_counts_begin = row_counts.begin(); + auto page_keys = make_page_key_iterator(pages); + auto const row_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys, + page_keys + pages.size(), + size_iter, + thrust::make_discard_iterator(), + row_counts_begin) + .second; + + // make sure all non-zero row counts are the same + rmm::device_uvector compacted_row_counts(pages.size(), stream); + auto const compacted_row_counts_begin = compacted_row_counts.begin(); + auto const compacted_row_counts_end = thrust::copy_if(rmm::exec_policy(stream), + row_counts_begin, + row_counts_end, + compacted_row_counts_begin, + row_counts_nonzero{}); + if (compacted_row_counts_end != compacted_row_counts_begin) { + size_t const found_row_count = static_cast(compacted_row_counts.element(0, stream)); + + // if we somehow don't match the expected row count from the row groups themselves + if (expected_row_count.has_value()) { + CUDF_EXPECTS(expected_row_count.value() == found_row_count, + "Encountered malformed parquet page data (unexpected row count in page data)"); + } + + // all non-zero row counts must be the same + auto const chk = + thrust::count_if(rmm::exec_policy(stream), + compacted_row_counts_begin, + compacted_row_counts_end, + row_counts_different{static_cast(found_row_count)}); + CUDF_EXPECTS(chk == 0, + "Encountered malformed parquet page data (row count mismatch in page data)"); + } +} + +struct decompression_info { + Compression codec; + size_t num_pages; + size_t max_page_decompressed_size; + size_t total_decompressed_size; +}; + +/** + * @brief Functor which retrieves per-page decompression information. + * + */ +struct get_decomp_info { + device_span chunks; + + __device__ decompression_info operator()(PageInfo const& p) const + { + return {static_cast(chunks[p.chunk_idx].codec), + 1, + static_cast(p.uncompressed_page_size), + static_cast(p.uncompressed_page_size)}; + } +}; + +/** + * @brief Functor which accumulates per-page decompression information. + * + */ +struct decomp_sum { + __device__ decompression_info operator()(decompression_info const& a, + decompression_info const& b) const + { + return {a.codec, + a.num_pages + b.num_pages, + std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), + a.total_decompressed_size + b.total_decompressed_size}; + } +}; + +/** + * @brief Functor which returns total scratch space required based on computed decompression_info + * data. 
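Illustrative aside, not part of the patch: what the decomp_sum reduction above produces for one column's pages, written as a plain host fold instead of a segmented scan. These three aggregates (page count, largest page, total bytes) are exactly what the scratch-size queries below consume.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

struct decomp_info {
  std::size_t num_pages;
  std::size_t max_page_decompressed_size;
  std::size_t total_decompressed_size;
};

int main()
{
  // per-page info for a single column (codec field omitted for brevity)
  std::vector<decomp_info> const pages{{1, 4096, 4096}, {1, 16384, 16384}, {1, 8192, 8192}};

  auto const combined = std::accumulate(
    pages.begin(), pages.end(), decomp_info{0, 0, 0},
    [](decomp_info const& a, decomp_info const& b) {
      return decomp_info{a.num_pages + b.num_pages,
                         std::max(a.max_page_decompressed_size, b.max_page_decompressed_size),
                         a.total_decompressed_size + b.total_decompressed_size};
    });

  std::printf("pages=%zu max=%zu total=%zu\n",
              combined.num_pages,
              combined.max_page_decompressed_size,
              combined.total_decompressed_size);
  return 0;
}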
+ * + */ +struct get_decomp_scratch { + size_t operator()(decompression_info const& di) const { - return a.row_count < b.row_count; + switch (di.codec) { + case UNCOMPRESSED: + case GZIP: return 0; + + case BROTLI: return get_gpu_debrotli_scratch_size(di.num_pages); + + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { + return cudf::io::nvcomp::batched_decompress_temp_size( + cudf::io::nvcomp::compression_type::SNAPPY, + di.num_pages, + di.max_page_decompressed_size, + di.total_decompressed_size); + } else { + return 0; + } + break; + + case ZSTD: + return cudf::io::nvcomp::batched_decompress_temp_size( + cudf::io::nvcomp::compression_type::ZSTD, + di.num_pages, + di.max_page_decompressed_size, + di.total_decompressed_size); + + default: CUDF_FAIL("Invalid compression codec for parquet decompression"); + } } }; +/** + * @brief Add the cost of decompression codec scratch space to the per-page cumulative + * size information. + * + */ +void include_decompression_scratch_size(device_span chunks, + device_span pages, + device_span c_info, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() == c_info.size(), + "Encountered page/cumulative_page_info size mismatch"); + + auto page_keys = make_page_key_iterator(pages); + + // per-codec page counts and decompression sizes + rmm::device_uvector decomp_info(pages.size(), stream); + auto decomp_iter = thrust::make_transform_iterator(pages.begin(), get_decomp_info{chunks}); + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream), + page_keys, + page_keys + pages.size(), + decomp_iter, + decomp_info.begin(), + thrust::equal_to{}, + decomp_sum{}); + + // retrieve to host so we can call nvcomp to get compression scratch sizes + std::vector h_decomp_info = + cudf::detail::make_std_vector_sync(decomp_info, stream); + std::vector temp_cost(pages.size()); + thrust::transform(thrust::host, + h_decomp_info.begin(), + h_decomp_info.end(), + temp_cost.begin(), + get_decomp_scratch{}); + + // add to the cumulative_page_info data + rmm::device_uvector d_temp_cost = cudf::detail::make_device_uvector_async( + temp_cost, stream, rmm::mr::get_current_device_resource()); + auto iter = thrust::make_counting_iterator(size_t{0}); + thrust::for_each(rmm::exec_policy_nosync(stream), + iter, + iter + pages.size(), + [temp_cost = d_temp_cost.begin(), c_info = c_info.begin()] __device__(size_t i) { + c_info[i].size_bytes += temp_cost[i]; + }); + stream.synchronize(); +} + } // anonymous namespace +void reader::impl::handle_chunking(bool uses_custom_row_bounds) +{ + // if this is our first time in here, setup the first pass. + if (!_pass_itm_data) { + // setup the next pass + setup_next_pass(uses_custom_row_bounds); + } + + auto& pass = *_pass_itm_data; + + // if we already have a subpass in flight. + if (pass.subpass != nullptr) { + // if it still has more chunks in flight, there's nothing more to do + if (pass.subpass->current_output_chunk < pass.subpass->output_chunk_read_info.size()) { + return; + } + + // increment rows processed + pass.processed_rows += pass.subpass->num_rows; + + // release the old subpass (will free memory) + pass.subpass.reset(); + + // otherwise we are done with the pass entirely + if (pass.processed_rows == pass.num_rows) { + // release the old pass + _pass_itm_data.reset(); + + _file_itm_data._current_input_pass++; + // no more passes. we are absolutely done with this file. 
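Illustrative aside, not part of the patch: handle_chunking is ultimately driven by the public chunked reader. A typical consumer loop looks roughly like the sketch below; it assumes the cudf::io::chunked_parquet_reader overload that takes both an output chunk limit and an input pass limit, and the 512 MB / 1 GB values are arbitrary.

#include <cudf/io/parquet.hpp>

#include <string>

void read_in_chunks(std::string const& path)
{
  auto const options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{path}).build();

  // limit each returned table to ~512 MB and each pass of loaded input to ~1 GB
  auto reader =
    cudf::io::chunked_parquet_reader(512 * 1024 * 1024, 1024 * 1024 * 1024, options);

  while (reader.has_next()) {
    auto chunk = reader.read_chunk();  // cudf::io::table_with_metadata
    // ... consume chunk.tbl ...
  }
}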
+ if (_file_itm_data._current_input_pass == _file_itm_data.num_passes()) { return; } + + // setup the next pass + setup_next_pass(uses_custom_row_bounds); + } + } + + // setup the next sub pass + setup_next_subpass(uses_custom_row_bounds); +} + +void reader::impl::setup_next_pass(bool uses_custom_row_bounds) +{ + auto const num_passes = _file_itm_data.num_passes(); + + // always create the pass struct, even if we end up with no work. + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && + not _input_columns.empty() && _file_itm_data._current_input_pass < num_passes) { + auto& pass = *_pass_itm_data; + + // setup row groups to be loaded for this pass + auto const row_group_start = + _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass]; + auto const row_group_end = + _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + pass.row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + pass.row_groups.begin()); + + CUDF_EXPECTS(_file_itm_data._current_input_pass < num_passes, + "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + pass.chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, pass.chunks.begin()); + + // compute skip_rows / num_rows for this pass. + if (num_passes == 1) { + pass.skip_rows = _file_itm_data.global_skip_rows; + pass.num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = + std::max(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass], + global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1], + global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + pass.skip_rows = + global_start_row + + _file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass]; + pass.num_rows = end_row - start_row; + } + + // load page information for the chunk. this retrieves the compressed bytes for all the + // pages, and their headers (which we can access without decompressing) + read_compressed_data(); + + // detect malformed columns. + // - we have seen some cases in the wild where we have a row group containing N + // rows, but the total number of rows in the pages for column X is != N. while it + // is possible to load this by just capping the number of rows read, we cannot tell + // which rows are invalid so we may be returning bad data. in addition, this mismatch + // confuses the chunked reader + detect_malformed_pages( + pass.pages, + pass.chunks, + uses_custom_row_bounds ? 
std::nullopt : std::make_optional(pass.num_rows), + _stream); + + // decompress dictionary data if applicable. + if (pass.has_compressed_data) { + pass.decomp_dict_data = decompress_page_data(pass.chunks, pass.pages, true, _stream); + } + + // store off how much memory we've used so far. This includes the compressed page data and the + // decompressed dictionary data. we will subtract this from the available total memory for the + // subpasses + auto chunk_iter = + thrust::make_transform_iterator(pass.chunks.d_begin(), get_chunk_compressed_size{}); + pass.base_mem_size = + pass.decomp_dict_data.size() + + thrust::reduce(rmm::exec_policy(_stream), chunk_iter, chunk_iter + pass.chunks.size()); + + // since there is only ever 1 dictionary per chunk (the first page), do it at the + // pass level. + build_string_dict_indices(); + + // if we are doing subpass reading, generate more accurate num_row estimates for list columns. + // this helps us to generate more accurate subpass splits. + if (_input_pass_read_limit != 0) { generate_list_column_row_count_estimates(); } + +#if defined(PARQUET_CHUNK_LOGGING) + printf("Pass: row_groups(%'lu), chunks(%'lu), pages(%'lu)\n", + pass.row_groups.size(), + pass.chunks.size(), + pass.pages.size()); + printf("\tskip_rows: %'lu\n", pass.skip_rows); + printf("\tnum_rows: %'lu\n", pass.num_rows); + printf("\tbase mem usage: %'lu\n", pass.base_mem_size); + auto const num_columns = _input_columns.size(); + for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { + printf("\t\tColumn %'lu: num_pages(%'d)\n", + c_idx, + pass.page_offsets[c_idx + 1] - pass.page_offsets[c_idx]); + } +#endif + + _stream.synchronize(); + } +} + +void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) +{ + auto& pass = *_pass_itm_data; + pass.subpass = std::make_unique(); + auto& subpass = *pass.subpass; + + auto const num_columns = _input_columns.size(); + + // if the user has passed a very small value (under the hardcoded minimum_subpass_expected_size), + // respect it. + auto const min_subpass_size = std::min(_input_pass_read_limit, minimum_subpass_expected_size); + + // what do we do if the base memory size (the compressed data) itself is approaching or larger + // than the overall read limit? we are still going to be decompressing in subpasses, but we have + // to assume some reasonable minimum size needed to safely decompress a single subpass. so always + // reserve at least that much space. this can result in using up to 2x the specified user limit + // but should only ever happen with unrealistically low numbers. + size_t const remaining_read_limit = + _input_pass_read_limit == 0 ? 0 + : pass.base_mem_size + min_subpass_size >= _input_pass_read_limit + ? min_subpass_size + : _input_pass_read_limit - pass.base_mem_size; + + auto [page_indices, total_pages, total_expected_size] = + [&]() -> std::tuple, size_t, size_t> { + // special case: if we contain no compressed data, or if we have no input limit, we can always + // just do 1 subpass since what we already have loaded is all the temporary memory we will ever + // use. 
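Illustrative aside, not part of the patch: the remaining_read_limit policy spelled out in the comment above, reduced to plain arithmetic. minimum_subpass_expected_size refers to the reader's constant of that name; the 200 MB figure and the other numbers are made up for the example.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t const minimum_subpass_expected_size = 200'000'000;  // illustrative value only

  auto const remaining_limit = [&](std::size_t pass_read_limit, std::size_t base_mem_size) {
    std::size_t const min_subpass_size = std::min(pass_read_limit, minimum_subpass_expected_size);
    if (pass_read_limit == 0) { return std::size_t{0}; }  // 0 means "no limit"
    // compressed data already crowds out the limit: still reserve a sane minimum,
    // which can push total usage past the user's number
    if (base_mem_size + min_subpass_size >= pass_read_limit) { return min_subpass_size; }
    return pass_read_limit - base_mem_size;
  };

  std::printf("%zu\n", remaining_limit(1'000'000'000, 300'000'000));  // 700000000
  std::printf("%zu\n", remaining_limit(250'000'000, 240'000'000));    // 200000000
  return 0;
}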
+ if (!pass.has_compressed_data || _input_pass_read_limit == 0) { + std::vector page_indices; + page_indices.reserve(num_columns); + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, iter + num_columns, std::back_inserter(page_indices), [&](size_t i) -> page_span { + return {static_cast(pass.page_offsets[i]), + static_cast(pass.page_offsets[i + 1])}; + }); + return {page_indices, pass.pages.size(), 0}; + } + // otherwise we have to look forward and choose a batch of pages + + // as subpasses get decoded, the initial estimates we have for list row counts + // get updated with accurate data, so regenerate cumulative size info and row + // indices + rmm::device_uvector c_info(pass.pages.size(), _stream); + auto page_keys = make_page_key_iterator(pass.pages); + auto page_size = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_input_size{}); + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + page_keys, + page_keys + pass.pages.size(), + page_size, + c_info.begin(), + thrust::equal_to{}, + cumulative_page_sum{}); + + // include scratch space needed for decompression. for certain codecs (eg ZSTD) this + // can be considerable. + include_decompression_scratch_size(pass.chunks, pass.pages, c_info, _stream); + + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy_nosync(_stream), + iter, + iter + pass.pages.size(), + set_row_index{pass.chunks, pass.pages, c_info}); + // print_cumulative_page_info(pass.pages, pass.chunks, c_info, _stream); + + // get the next batch of pages + return compute_next_subpass(c_info, + pass.pages, + pass.page_offsets, + pass.processed_rows + pass.skip_rows, + remaining_read_limit, + num_columns, + _stream); + }(); + + // fill out the subpass struct + subpass.pages = cudf::detail::hostdevice_vector(0, total_pages, _stream); + subpass.page_src_index = + cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + // copy the appropriate subset of pages from each column + size_t page_count = 0; + for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { + auto const num_column_pages = page_indices[c_idx].end - page_indices[c_idx].start; + subpass.column_page_count.push_back(num_column_pages); + std::copy(pass.pages.begin() + page_indices[c_idx].start, + pass.pages.begin() + page_indices[c_idx].end, + std::back_inserter(subpass.pages)); + + // mapping back to original pages in the pass + thrust::sequence(thrust::host, + subpass.page_src_index.begin() + page_count, + subpass.page_src_index.begin() + page_count + num_column_pages, + page_indices[c_idx].start); + page_count += num_column_pages; + } + // print_hostdevice_vector(subpass.page_src_index); + + // decompress the data for the pages in this subpass. 
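Illustrative aside, not part of the patch: the flattening step above in miniature. Each column contributes the [start, end) slice of its pass-level pages chosen for the subpass, and page_src_index remembers where every copied page came from so results can be mapped back later.

#include <cstddef>
#include <cstdio>
#include <vector>

struct page_range {  // [start, end) into the pass-level page array
  std::size_t start;
  std::size_t end;
};

int main()
{
  std::vector<int> const pass_pages{10, 11, 12, 13, 20, 21, 22};  // column 0: 4 pages, column 1: 3 pages
  std::vector<page_range> const chosen{{1, 3}, {4, 6}};           // pages picked for this subpass

  std::vector<int> subpass_pages;
  std::vector<std::size_t> page_src_index;
  for (auto const& s : chosen) {
    for (std::size_t i = s.start; i < s.end; ++i) {
      subpass_pages.push_back(pass_pages[i]);
      page_src_index.push_back(i);  // subpass page -> pass page
    }
  }

  for (std::size_t i = 0; i < subpass_pages.size(); ++i) {
    std::printf("subpass page %zu came from pass page %zu\n", i, page_src_index[i]);
  }
  return 0;
}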
+ if (pass.has_compressed_data) { + subpass.decomp_page_data = decompress_page_data(pass.chunks, subpass.pages, false, _stream); + } + + subpass.pages.host_to_device_async(_stream); + subpass.page_src_index.host_to_device_async(_stream); + _stream.synchronize(); + + // buffers needed by the decode kernels + { + // nesting information (sizes, etc) stored -per page- + // note : even for flat schemas, we allocate 1 level of "nesting" info + allocate_nesting_info(); + + // level decode space + allocate_level_decode_space(); + } + subpass.pages.host_to_device_async(_stream); + + // preprocess pages (computes row counts for lists, computes output chunks and computes + // the actual row counts we will be able load out of this subpass) + preprocess_subpass_pages(uses_custom_row_bounds, _output_chunk_read_limit); + +#if defined(PARQUET_CHUNK_LOGGING) + printf("\tSubpass: skip_rows(%'lu), num_rows(%'lu), remaining read limit(%'lu)\n", + subpass.skip_rows, + subpass.num_rows, + remaining_read_limit); + printf("\t\tDecompressed size: %'lu\n", subpass.decomp_page_data.size()); + printf("\t\tTotal expected usage: %'lu\n", + total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size + : total_expected_size + pass.base_mem_size); + for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { + printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n", + c_idx, + page_indices[c_idx].start, + page_indices[c_idx].end); + } + printf("\t\tOutput chunks:\n"); + for (size_t idx = 0; idx < subpass.output_chunk_read_info.size(); idx++) { + printf("\t\t\t%'lu: skip_rows(%'lu) num_rows(%'lu)\n", + idx, + subpass.output_chunk_read_info[idx].skip_rows, + subpass.output_chunk_read_info[idx].num_rows); + } +#endif +} + void reader::impl::create_global_chunk_info() { auto const num_rows = _file_itm_data.global_num_rows; @@ -380,6 +1403,14 @@ void reader::impl::create_global_chunk_info() schema.converted_type, schema.type_length); + // for lists, estimate the number of bytes per row. this is used by the subpass reader to + // determine where to split the decompression boundaries + float const list_bytes_per_row_est = + schema.max_repetition_level > 0 && row_group.num_rows > 0 + ? static_cast(col_meta.total_uncompressed_size) / + static_cast(row_group.num_rows) + : 0.0f; + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, nullptr, col_meta.num_values, @@ -398,7 +1429,8 @@ void reader::impl::create_global_chunk_info() schema.decimal_precision, clock_rate, i, - col.schema_idx)); + col.schema_idx, + list_bytes_per_row_est)); } remaining_rows -= row_group_rows; @@ -415,185 +1447,101 @@ void reader::impl::compute_input_passes() if (_input_pass_read_limit == 0) { _file_itm_data.input_pass_row_group_offsets.push_back(0); _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_start_row_count.push_back(0); + auto rg_row_count = cudf::detail::make_counting_transform_iterator(0, [&](size_t i) { + auto const& rgi = row_groups_info[i]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + return row_group.num_rows; + }); + _file_itm_data.input_pass_start_row_count.push_back( + std::reduce(rg_row_count, rg_row_count + row_groups_info.size())); return; } // generate passes. make sure to account for the case where a single row group doesn't fit within // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); + std::size_t const comp_read_limit = + _input_pass_read_limit > 0 + ? 
static_cast(_input_pass_read_limit * input_limit_compression_reserve) + : std::numeric_limits::max(); std::size_t cur_pass_byte_size = 0; std::size_t cur_rg_start = 0; std::size_t cur_row_count = 0; _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_count.push_back(0); + _file_itm_data.input_pass_start_row_count.push_back(0); for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { auto const& rgi = row_groups_info[cur_rg_index]; auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + // total compressed size and total size (compressed + uncompressed) for + auto const [compressed_rg_size, _ /*compressed + uncompressed*/] = + get_row_group_size(row_group); + // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + if (cur_pass_byte_size + compressed_rg_size >= comp_read_limit) { // A single row group (the current one) is larger than the read limit: // We always need to include at least one row group, so end the pass at the end of the current // row group if (cur_rg_start == cur_rg_index) { _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group.num_rows); cur_rg_start = cur_rg_index + 1; cur_pass_byte_size = 0; } // End the pass at the end of the previous row group else { _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count); cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; + cur_pass_byte_size = compressed_rg_size; } } else { - cur_pass_byte_size += row_group.total_byte_size; + cur_pass_byte_size += compressed_rg_size; } cur_row_count += row_group.num_rows; } + // add the last pass if necessary if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_next_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; - auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, 
_pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = - std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = - std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = - global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count); } } -void reader::impl::compute_splits_for_pass() +void reader::impl::compute_output_chunks_for_subpass() { - auto const skip_rows = _pass_itm_data->skip_rows; - auto const num_rows = _pass_itm_data->num_rows; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; // simple case : no chunk size, no splits if (_output_chunk_read_limit <= 0) { - _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + subpass.output_chunk_read_info.push_back({subpass.skip_rows, subpass.num_rows}); return; } - auto& pages = _pass_itm_data->pages_info; - - auto const& page_keys = _pass_itm_data->page_keys; - auto const& page_index = _pass_itm_data->page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), _stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), + // generate row_indices and cumulative output sizes for all pages + rmm::device_uvector c_info(subpass.pages.size(), _stream); + auto page_input = + thrust::make_transform_iterator(subpass.pages.d_begin(), get_page_output_size{}); + auto page_keys = make_page_key_iterator(subpass.pages); + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + page_keys, + page_keys + subpass.pages.size(), page_input, c_info.begin(), thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, _stream}; - thrust::sort( - rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), _stream); - thrust::transform(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); - - // generate the actual splits - _pass_itm_data->output_chunk_read_info = - find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); + cumulative_page_sum{}); + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy_nosync(_stream), + iter, + iter + subpass.pages.size(), + set_row_index{pass.chunks, subpass.pages, c_info}); + // print_cumulative_page_info(subpass.pages, c_info, _stream); + + // compute the splits + subpass.output_chunk_read_info = compute_page_splits_by_row( + c_info, subpass.pages, subpass.skip_rows, subpass.num_rows, _output_chunk_read_limit, _stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index dfc239d8451..a9cf0e94ec8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,58 +30,105 @@ struct file_intermediate_data { // all row groups to read std::vector row_groups{}; - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once + // all chunks from the selected row groups. std::vector chunks{}; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. std::vector input_pass_row_group_offsets{}; - // row counts per input-pass - std::vector input_pass_row_count{}; - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds + // start row counts per input-pass. 
this includes all rows in the row groups of the pass and + // is not capped by global_skip_rows and global_num_rows. + std::vector input_pass_start_row_count{}; + + size_t _current_input_pass{0}; // current input pass index + size_t _output_chunk_count{0}; // how many output chunks we have produced + + // skip_rows/num_rows values for the entire file. size_t global_skip_rows; size_t global_num_rows; + + [[nodiscard]] size_t num_passes() const + { + return input_pass_row_group_offsets.size() == 0 ? 0 : input_pass_row_group_offsets.size() - 1; + } }; /** - * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + * @brief Struct to identify a range of rows. */ -struct chunk_read_info { +struct row_range { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Passes are broken down into subpasses based on temporary memory constraints. + */ +struct subpass_intermediate_data { + rmm::device_buffer decomp_page_data; + + rmm::device_buffer level_decode_data{}; + cudf::detail::hostdevice_vector pages{}; + // for each page in the subpass, the index of our source page in the pass + cudf::detail::hostdevice_vector page_src_index{}; + // for each column in the file (indexed by _input_columns.size()) + // the number of associated pages for this subpass + std::vector column_page_count; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; + + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + // skip_rows and num_rows values for this particular subpass. in absolute row indices. size_t skip_rows; size_t num_rows; }; /** * @brief Struct to store pass-level data that remains constant for a single pass. + * + * A pass is defined as a set of rowgroups read but not yet decompressed. This set of + * rowgroups may represent less than all of the rowgroups to be read for the file. */ struct pass_intermediate_data { std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; // rowgroup, chunk and page information for the current pass. + bool has_compressed_data{false}; std::vector row_groups{}; cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; + cudf::detail::hostdevice_vector pages{}; - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + // base memory used for the pass itself (compressed data in the loaded chunks and any + // decompressed dictionary pages) + size_t base_mem_size{0}; - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; + // offsets to each group of input pages (by column/schema, indexed by _input_columns.size()) + // so if we had 2 columns/schemas, with page keys + // + // 1 1 1 1 1 2 2 2 + // + // page_offsets would be 0, 5, 8 + cudf::detail::hostdevice_vector page_offsets{}; + + rmm::device_buffer decomp_dict_data{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - rmm::device_buffer level_decode_data{}; int level_type_size{0}; - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. + // skip_rows / num_rows for this pass. + // NOTE: skip_rows is the absolute row index in the file. 
size_t skip_rows; size_t num_rows; + // number of rows we have processed so far (out of num_rows). note that this + // only includes the number of rows we have processed before starting the current + // subpass. it does not get updated as a subpass iterates through output chunks. + size_t processed_rows{0}; + + // currently active subpass + std::unique_ptr subpass{}; }; } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e10f2c00f40..ee3b1c466e0 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,6 @@ #include "error.hpp" #include "reader_impl.hpp" -#include -#include - #include #include #include @@ -49,6 +46,28 @@ namespace cudf::io::parquet::detail { namespace { +#if defined(PREPROCESS_DEBUG) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) +{ + pages.device_to_host_sync(_stream); + for (size_t idx = 0; idx < pages.size(); idx++) { + auto const& p = pages[idx]; + // skip dictionary pages + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + printf( + "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " + "str_bytes(%d)\n", + idx, + p.src_col_schema, + p.chunk_row, + p.num_rows, + p.skipped_values, + p.skipped_leaf_values, + p.str_bytes); + } +} +#endif // PREPROCESS_DEBUG + /** * @brief Generate depth remappings for repetition and definition levels. * @@ -269,7 +288,7 @@ void generate_depth_remappings(std::map, std::ve kernel_error error_code(stream); chunks.host_to_device_async(stream); - DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); + DecodePageHeaders(chunks.device_ptr(), nullptr, chunks.size(), error_code.data(), stream); chunks.device_to_host_sync(stream); // It's required to ignore unsupported encodings in this function @@ -351,33 +370,37 @@ std::string encoding_to_string(Encoding encoding) } /** - * @brief Decode the page information from the given column chunks. + * @brief Decode the page information for a given pass. * - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param stream CUDA stream used for device memory operations and kernel launches - * @returns The size in bytes of level type data required + * @param pass_intermediate_data The struct containing pass information */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view stream) +void decode_page_headers(pass_intermediate_data& pass, + device_span unsorted_pages, + rmm::cuda_stream_view stream) { + cudf::detail::hostdevice_vector chunk_page_info(pass.chunks.size(), stream); + // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), // please update preprocess_nested_columns to reflect this. 
- for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; - chunks[c].page_info = pages.device_ptr(page_count); - page_count += chunks[c].max_num_pages; + for (size_t c = 0, page_count = 0; c < pass.chunks.size(); c++) { + pass.chunks[c].max_num_pages = pass.chunks[c].num_data_pages + pass.chunks[c].num_dict_pages; + chunk_page_info[c].pages = &unsorted_pages[page_count]; + page_count += pass.chunks[c].max_num_pages; } kernel_error error_code(stream); - chunks.host_to_device_async(stream); - DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); + pass.chunks.host_to_device_async(stream); + chunk_page_info.host_to_device_async(stream); + DecodePageHeaders(pass.chunks.device_ptr(), + chunk_page_info.device_ptr(), + pass.chunks.size(), + error_code.data(), + stream); if (error_code.value() != 0) { if (BitAnd(error_code.value(), decode_error::UNSUPPORTED_ENCODING) != 0) { auto const unsupported_str = - ". With unsupported encodings found: " + list_unsupported_encodings(pages, stream); + ". With unsupported encodings found: " + list_unsupported_encodings(pass.pages, stream); CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str() + unsupported_str); } else { CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str()); @@ -386,7 +409,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type([chunks = chunks.d_begin()] __device__(int i) { + 0, cuda::proclaim_return_type([chunks = pass.chunks.d_begin()] __device__(int i) { auto c = chunks[i]; return static_cast( max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); @@ -394,223 +417,243 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), level_bit_size, - level_bit_size + chunks.size(), + level_bit_size + pass.chunks.size(), 0, thrust::maximum()); + pass.level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); - return std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); -} + // sort the pages in chunk/schema order. we use chunk.src_col_index instead of + // chunk.src_col_schema because the user may have reordered them (reading columns, "a" and "b" but + // returning them as "b" and "a") + // + // ordering of pages is by input column schema, repeated across row groups. so + // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like + // + // 1, 1, 2, 2, 3, 3 + // + // However, if we had more than one row group, the pattern would be + // + // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3 + // ^ row group 0 | + // ^ row group 1 + // + // To process pages by key (exclusive_scan_by_key, reduce_by_key, etc), the ordering we actually + // want is + // + // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 + // + // We also need to preserve key-relative page ordering, so we need to use a stable sort. 
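Illustrative aside, not part of the patch: the reordering described in the comment above, done on the host with std::stable_sort over page indices (the same trick as sort_indices below) followed by a run-length pass that yields the per-column page offsets.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  // key (input column) of each page in file order: 2 row groups x 3 columns x 2 pages
  std::vector<int> const page_keys{1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3};

  // sort indices rather than the pages themselves, preserving order within a key
  std::vector<std::size_t> order(page_keys.size());
  for (std::size_t i = 0; i < order.size(); ++i) { order[i] = i; }
  std::stable_sort(order.begin(), order.end(),
                   [&](std::size_t a, std::size_t b) { return page_keys[a] < page_keys[b]; });

  // offsets to the first page of each key group, plus an end sentinel
  std::vector<std::size_t> offsets{0};
  for (std::size_t i = 1; i < order.size(); ++i) {
    if (page_keys[order[i]] != page_keys[order[i - 1]]) { offsets.push_back(i); }
  }
  offsets.push_back(order.size());

  for (auto const o : offsets) { std::printf("%zu ", o); }  // prints: 0 4 8 12
  std::printf("\n");
  return 0;
}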
+ { + rmm::device_uvector page_keys{unsorted_pages.size(), stream}; + thrust::transform(rmm::exec_policy_nosync(stream), + unsorted_pages.begin(), + unsorted_pages.end(), + page_keys.begin(), + [chunks = pass.chunks.d_begin()] __device__(PageInfo const& page) { + return chunks[page.chunk_idx].src_col_index; + }); + // we are doing this by sorting indices first and then transforming the output because nvcc + // started generating kernels using too much shared memory when trying to sort the pages + // directly. + rmm::device_uvector sort_indices(unsorted_pages.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), sort_indices.begin(), sort_indices.end(), 0); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + page_keys.begin(), + page_keys.end(), + sort_indices.begin(), + thrust::less()); + pass.pages = cudf::detail::hostdevice_vector( + unsorted_pages.size(), unsorted_pages.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + sort_indices.begin(), + sort_indices.end(), + pass.pages.d_begin(), + [unsorted_pages = unsorted_pages.begin()] __device__(int32_t i) { + return unsorted_pages[i]; + }); + } -/** - * @brief Decompresses the page data, at page granularity. - * - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Device buffer to decompressed page data - */ -[[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view stream) -{ - auto for_each_codec_page = [&](Compression codec, std::function const& f) { - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - const auto page_stride = chunks[c].max_num_pages; - if (chunks[c].codec == codec) { - for (int k = 0; k < page_stride; k++) { - f(page_count + k); - } - } - page_count += page_stride; - } - }; + // compute offsets to each group of input pages. 
+ // page_keys: 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 + // + // result: 0, 4, 8 + rmm::device_uvector page_counts(pass.pages.size() + 1, stream); + auto page_keys = make_page_key_iterator(pass.pages); + auto const page_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys, + page_keys + pass.pages.size(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + page_counts.begin()) + .second; + auto const num_page_counts = page_counts_end - page_counts.begin(); + pass.page_offsets = cudf::detail::hostdevice_vector(num_page_counts + 1, stream); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + page_counts.begin(), + page_counts.begin() + num_page_counts + 1, + pass.page_offsets.d_begin()); + + // setup dict_page for each chunk if necessary + thrust::for_each(rmm::exec_policy_nosync(stream), + pass.pages.d_begin(), + pass.pages.d_end(), + [chunks = pass.chunks.d_begin()] __device__(PageInfo const& p) { + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { + chunks[p.chunk_idx].dict_page = &p; + } + }); - // Brotli scratch memory for decompressing - rmm::device_buffer debrotli_scratch; + pass.page_offsets.device_to_host_async(stream); + pass.pages.device_to_host_async(stream); + pass.chunks.device_to_host_async(stream); + stream.synchronize(); +} - // Count the exact number of compressed pages - size_t num_comp_pages = 0; - size_t total_decomp_size = 0; +struct set_str_dict_index_count { + device_span str_dict_index_count; + device_span chunks; - struct codec_stats { - Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; - }; + __device__ void operator()(PageInfo const& page) + { + auto const& chunk = chunks[page.chunk_idx]; + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && (chunk.data_type & 0x7) == BYTE_ARRAY && + (chunk.num_dict_pages > 0)) { + // there is only ever one dictionary page per chunk, so this is safe to do in parallel. + str_dict_index_count[page.chunk_idx] = page.num_input_values; + } + } +}; - std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; +struct set_str_dict_index_ptr { + string_index_pair* const base; + device_span str_dict_index_offsets; + device_span chunks; - auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == UNCOMPRESSED) return true; - return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { - return codec == cstats.compression_type; - }) != codecs.end(); - }; - CUDF_EXPECTS(std::all_of(chunks.begin(), - chunks.end(), - [&is_codec_supported](auto const& chunk) { - return is_codec_supported(chunk.codec); - }), - "Unsupported compression type"); - - for (auto& codec : codecs) { - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto page_uncomp_size = pages[page].uncompressed_page_size; - total_decomp_size += page_uncomp_size; - codec.total_decomp_size += page_uncomp_size; - codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); - codec.num_pages++; - num_comp_pages++; - }); - if (codec.compression_type == BROTLI && codec.num_pages > 0) { - debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + __device__ void operator()(size_t i) + { + auto& chunk = chunks[i]; + if ((chunk.data_type & 0x7) == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { + chunk.str_dict_index = base + str_dict_index_offsets[i]; } } +}; - // Dispatch batches of pages to decompress for each codec. 
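// The page_offsets computation at the top of this hunk can be reproduced in isolation:
// reduce_by_key over a constant-1 iterator counts the pages per (sorted) column key, and an
// exclusive scan over those counts yields the start offset of each column's page range plus
// the overall total. A minimal sketch with toy data (illustrative only):
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <vector>

int main()
{
  std::vector<int> h_keys{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};  // sorted page keys
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
  thrust::device_vector<int> counts(keys.size() + 1);  // zero-initialized

  auto const counts_end = thrust::reduce_by_key(keys.begin(),
                                                keys.end(),
                                                thrust::make_constant_iterator(1),
                                                thrust::make_discard_iterator(),
                                                counts.begin())
                            .second;
  auto const num_columns = counts_end - counts.begin();  // 3 distinct keys -> counts 4, 4, 4

  thrust::device_vector<int> offsets(num_columns + 1);
  thrust::exclusive_scan(counts.begin(), counts.begin() + num_columns + 1, offsets.begin());
  // offsets -> 0, 4, 8, 12 : start of each column's pages, with the total appended
  return 0;
}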
- // Buffer needs to be padded, required by `gpuDecodePageData`. - rmm::device_buffer decomp_pages( - cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - - std::vector> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); - - // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); - - rmm::device_uvector comp_res(num_comp_pages, stream); - thrust::fill(rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - compression_result{0, compression_status::FAILURE}); - - size_t decomp_offset = 0; - int32_t start_pos = 0; - for (auto const& codec : codecs) { - if (codec.num_pages == 0) { continue; } - - for_each_codec_page(codec.compression_type, [&](size_t page_idx) { - auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; - auto& page = pages[page_idx]; - // offset will only be non-zero for V2 pages - auto const offset = - page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; - // for V2 need to copy def and rep level info into place, and then offset the - // input and output buffers. otherwise we'd have to keep both the compressed - // and decompressed data. - if (offset != 0) { - copy_in.emplace_back(page.page_data, offset); - copy_out.emplace_back(dst_base, offset); - } - comp_in.emplace_back(page.page_data + offset, - static_cast(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); - page.page_data = dst_base; - decomp_offset += page.uncompressed_page_size; - }); +/** + * @brief Functor which computes an estimated row count for list pages. 
+ * + */ +struct set_list_row_count_estimate { + device_span chunks; - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async( - comp_in_view, stream, rmm::mr::get_current_device_resource()); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async( - comp_out_view, stream, rmm::mr::get_current_device_resource()); - device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); - - switch (codec.compression_type) { - case GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); - break; - case SNAPPY: - if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - stream); - } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); - } - break; - case ZSTD: - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - stream); - break; - case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, - d_comp_res_view, - debrotli_scratch.data(), - debrotli_scratch.size(), - stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - start_pos += codec.num_pages; + __device__ void operator()(PageInfo& page) + { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return; } + auto const& chunk = chunks[page.chunk_idx]; + auto const is_list = chunk.max_level[level_type::REPETITION] > 0; + if (!is_list) { return; } + + // For LIST pages that we have not yet decoded, page.num_rows is not an accurate number. + // so we instead estimate the number of rows as follows: + // - each chunk stores an estimated number of bytes per row E + // - estimate number of rows in a page = page.uncompressed_page_size / E + // + // it is not required that this number is accurate. we just want it to be somewhat close so that + // we get reasonable results as we choose subpass splits. + // + // all other columns can use page.num_rows directly as it will be accurate. + page.num_rows = static_cast(static_cast(page.uncompressed_page_size) / + chunk.list_bytes_per_row_est); } +}; + +/** + * @brief Set the expected row count on the final page for all columns. 
+ * + */ +struct set_final_row_count { + device_span pages; + device_span chunks; + device_span page_offsets; + size_t const max_row; - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - [] __device__(auto const& res) { - return res.status == compression_status::SUCCESS; - }), - "Error during decompression"); - - // now copy the uncompressed V2 def and rep level data - if (not copy_in.empty()) { - auto const d_copy_in = cudf::detail::make_device_uvector_async( - copy_in, stream, rmm::mr::get_current_device_resource()); - auto const d_copy_out = cudf::detail::make_device_uvector_async( - copy_out, stream, rmm::mr::get_current_device_resource()); - - gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); - stream.synchronize(); + __device__ void operator()(size_t i) + { + auto const last_page_index = page_offsets[i + 1] - 1; + auto const& page = pages[last_page_index]; + auto const& chunk = chunks[page.chunk_idx]; + size_t const page_start_row = chunk.start_row + page.chunk_row; + pages[last_page_index].num_rows = max_row - page_start_row; } +}; - // Update the page information in device memory with the updated value of - // page_data; it now points to the uncompressed data buffer - pages.host_to_device_async(stream); +} // anonymous namespace - return decomp_pages; +void reader::impl::build_string_dict_indices() +{ + auto& pass = *_pass_itm_data; + + // compute number of indices per chunk and a summed total + rmm::device_uvector str_dict_index_count(pass.chunks.size() + 1, _stream); + thrust::fill( + rmm::exec_policy_nosync(_stream), str_dict_index_count.begin(), str_dict_index_count.end(), 0); + thrust::for_each(rmm::exec_policy_nosync(_stream), + pass.pages.begin(), + pass.pages.end(), + set_str_dict_index_count{str_dict_index_count, pass.chunks}); + + size_t const total_str_dict_indexes = thrust::reduce( + rmm::exec_policy(_stream), str_dict_index_count.begin(), str_dict_index_count.end()); + if (total_str_dict_indexes == 0) { return; } + + // convert to offsets + rmm::device_uvector& str_dict_index_offsets = str_dict_index_count; + thrust::exclusive_scan(rmm::exec_policy_nosync(_stream), + str_dict_index_offsets.begin(), + str_dict_index_offsets.end(), + str_dict_index_offsets.begin(), + 0); + + // allocate and distribute pointers + pass.str_dict_index = cudf::detail::make_zeroed_device_uvector_async( + total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); + + auto iter = thrust::make_counting_iterator(0); + thrust::for_each( + rmm::exec_policy_nosync(_stream), + iter, + iter + pass.chunks.size(), + set_str_dict_index_ptr{pass.str_dict_index.data(), str_dict_index_offsets, pass.chunks}); + + // compute the indices + BuildStringDictionaryIndex(pass.chunks.device_ptr(), pass.chunks.size(), _stream); + pass.chunks.device_to_host_sync(_stream); } -} // namespace - void reader::impl::allocate_nesting_info() { - auto const& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto& page_nesting_info = _pass_itm_data->page_nesting_info; - auto& page_nesting_decode_info = _pass_itm_data->page_nesting_decode_info; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + auto const num_columns = _input_columns.size(); + auto& pages = subpass.pages; + auto& page_nesting_info = subpass.page_nesting_info; + auto& page_nesting_decode_info = subpass.page_nesting_decode_info; + + // generate the number of nesting info structs needed per-page, by column + std::vector 
per_page_nesting_info_size(num_columns); + auto iter = thrust::make_counting_iterator(size_type{0}); + std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) { + auto const schema_idx = _input_columns[i].schema_idx; + auto const& schema = _metadata->get_schema(schema_idx); + return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + }); // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation - size_t const total_page_nesting_infos = std::accumulate( - chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { - // the schema of the input column - auto const& schema = _metadata->get_schema(chunk.src_col_schema); - auto const per_page_nesting_info_size = max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); - return total + (per_page_nesting_info_size * chunk.num_data_pages); + auto counting_iter = thrust::make_counting_iterator(size_t{0}); + size_t const total_page_nesting_infos = + std::accumulate(counting_iter, counting_iter + num_columns, 0, [&](int total, size_t index) { + return total + (per_page_nesting_info_size[index] * subpass.column_page_count[index]); }); page_nesting_info = @@ -621,41 +664,33 @@ void reader::impl::allocate_nesting_info() // update pointers in the PageInfos int target_page_index = 0; int src_info_index = 0; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - auto& schema = _metadata->get_schema(src_col_schema); - auto const per_page_nesting_info_size = std::max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); - - // skip my dict pages - target_page_index += chunks[idx].num_dict_pages; - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const src_col_schema = _input_columns[idx].schema_idx; + + for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_decode = page_nesting_decode_info.device_ptr() + src_info_index; - pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size; + pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx]; pages[target_page_index + p_idx].num_output_nesting_levels = _metadata->get_output_nesting_depth(src_col_schema); - src_info_index += per_page_nesting_info_size; + src_info_index += per_page_nesting_info_size[idx]; } - target_page_index += chunks[idx].num_data_pages; + target_page_index += subpass.column_page_count[idx]; } // fill in int nesting_info_index = 0; std::map, std::vector>> depth_remapping; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const src_col_schema = _input_columns[idx].schema_idx; // schema of the input column auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) - int max_depth = _metadata->get_output_nesting_depth(src_col_schema); - - // # of nesting infos stored per page for this column - auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); + int const 
max_output_depth = _metadata->get_output_nesting_depth(src_col_schema); // if this column has lists, generate depth remapping std::map, std::vector>> depth_remapping; @@ -666,18 +701,19 @@ void reader::impl::allocate_nesting_info() // fill in host-side nesting info int schema_idx = src_col_schema; auto cur_schema = _metadata->get_schema(schema_idx); - int cur_depth = max_depth - 1; + int cur_depth = max_output_depth - 1; while (schema_idx > 0) { - // stub columns (basically the inner field of a list scheme element) are not real columns. + // stub columns (basically the inner field of a list schema element) are not real columns. // we can ignore them for the purposes of output nesting info if (!cur_schema.is_stub()) { // initialize each page within the chunk - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { PageNestingInfo* pni = - &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; + &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; PageNestingDecodeInfo* nesting_info = - &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; + &page_nesting_decode_info[nesting_info_index + + (p_idx * per_page_nesting_info_size[idx])]; // if we have lists, set our start and end depth remappings if (schema.max_repetition_level > 0) { @@ -712,7 +748,7 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } - nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); + nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } // copy nesting info to the device @@ -722,32 +758,33 @@ void reader::impl::allocate_nesting_info() void reader::impl::allocate_level_decode_space() { - auto& pages = _pass_itm_data->pages_info; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + auto& pages = subpass.pages; // TODO: this could be made smaller if we ignored dictionary pages and pages with no // repetition data. 
- size_t const per_page_decode_buf_size = - LEVEL_DECODE_BUF_SIZE * 2 * _pass_itm_data->level_type_size; - auto const decode_buf_size = per_page_decode_buf_size * pages.size(); - _pass_itm_data->level_decode_data = + size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * pass.level_type_size; + auto const decode_buf_size = per_page_decode_buf_size * pages.size(); + subpass.level_decode_data = rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource()); // distribute the buffers - uint8_t* buf = static_cast(_pass_itm_data->level_decode_data.data()); + uint8_t* buf = static_cast(subpass.level_decode_data.data()); for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; p.lvl_decode_buf[level_type::DEFINITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * pass.level_type_size); p.lvl_decode_buf[level_type::REPETITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * pass.level_type_size); } } -std::pair>> reader::impl::read_and_decompress_column_chunks() +std::pair>> reader::impl::read_column_chunks() { auto const& row_groups_info = _pass_itm_data->row_groups; - auto const num_rows = _pass_itm_data->num_rows; auto& raw_page_data = _pass_itm_data->raw_page_data; auto& chunks = _pass_itm_data->chunks; @@ -767,13 +804,14 @@ std::pair>> reader::impl::read_and_decompres // Initialize column chunk information size_t total_decompressed_size = 0; - auto remaining_rows = num_rows; + // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide + // skip_rows/num_rows + // auto remaining_rows = num_rows; std::vector> read_chunk_tasks; size_type chunk_count = 0; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); auto const row_group_source = rg.source_index; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); // generate ColumnChunkDesc objects for everything to be decoded (all input columns) for (size_t i = 0; i < num_input_columns; ++i) { @@ -795,7 +833,6 @@ std::pair>> reader::impl::read_and_decompres chunk_count++; } - remaining_rows -= row_group_rows; } // Read compressed chunk data to device memory @@ -808,22 +845,20 @@ std::pair>> reader::impl::read_and_decompres chunk_source_map, _stream)); - CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); - return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_and_decompress_data() +void reader::impl::read_compressed_data() { + auto& pass = *_pass_itm_data; + // This function should never be called if `num_rows == 0`. 
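// A host-side sketch of the buffer-carving pattern used by allocate_level_decode_space
// earlier in this hunk: one pooled allocation, then per-page definition/repetition decode
// pointers handed out at a fixed stride (a std::vector stands in for the rmm::device_buffer;
// sizes are illustrative, not the real LEVEL_DECODE_BUF_SIZE).
#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
  constexpr std::size_t num_pages      = 4;
  constexpr std::size_t level_buf_size = 256;  // stand-in for LEVEL_DECODE_BUF_SIZE * level_type_size
  std::vector<uint8_t> pool(num_pages * 2 * level_buf_size);  // def + rep buffer per page

  uint8_t* buf = pool.data();
  std::vector<uint8_t*> def_buf(num_pages), rep_buf(num_pages);
  for (std::size_t i = 0; i < num_pages; ++i) {
    def_buf[i] = buf; buf += level_buf_size;  // definition-level decode space for page i
    rep_buf[i] = buf; buf += level_buf_size;  // repetition-level decode space for page i
  }
  return 0;
}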
CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); - auto& raw_page_data = _pass_itm_data->raw_page_data; - auto& decomp_page_data = _pass_itm_data->decomp_page_data; - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; + auto& chunks = pass.chunks; - auto const [has_compressed_data, read_chunks_tasks] = read_and_decompress_column_chunks(); + auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); + pass.has_compressed_data = has_compressed_data; for (auto& task : read_chunks_tasks) { task.wait(); @@ -832,44 +867,12 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + rmm::device_uvector unsorted_pages(total_pages, _stream); // decoding of column/page information - _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); - pages.device_to_host_sync(_stream); - if (has_compressed_data) { - decomp_page_data = decompress_page_data(chunks, pages, _stream); - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } - } - } - - // build output column info - // walk the schema, building out_buffers that mirror what our final cudf columns will look - // like. important : there is not necessarily a 1:1 mapping between input columns and output - // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct - // columns. The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 columns of data (firstname, middlename, lastname). But of course - // "name" is a struct column that we want to return, so we have to make sure that we - // create it ourselves. 
- // std::vector output_info = build_output_column_info(); - - // the following two allocate functions modify the page data - { - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - allocate_nesting_info(); - - // level decode space - allocate_level_decode_space(); - } - pages.host_to_device_async(_stream); + decode_page_headers(pass, unsorted_pages, _stream); + CUDF_EXPECTS(pass.page_offsets.size() - 1 == static_cast(_input_columns.size()), + "Encountered page_offsets / num_columns mismatch"); } namespace { @@ -880,28 +883,6 @@ struct cumulative_row_info { int key; // schema index }; -#if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) -{ - pages.device_to_host_sync(_stream); - for (size_t idx = 0; idx < pages.size(); idx++) { - auto const& p = pages[idx]; - // skip dictionary pages - if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } - printf( - "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " - "str_bytes(%d)\n", - idx, - p.src_col_schema, - p.chunk_row, - p.num_rows, - p.skipped_values, - p.skipped_leaf_values, - p.str_bytes); - } -} -#endif // PREPROCESS_DEBUG - struct get_page_chunk_idx { __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; @@ -910,14 +891,6 @@ struct get_page_num_rows { __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; -struct get_page_column_index { - ColumnChunkDesc const* chunks; - __device__ size_type operator()(PageInfo const& page) - { - return chunks[page.chunk_idx].src_col_index; - } -}; - struct input_col_info { int const schema_idx; size_type const nesting_depth; @@ -950,13 +923,12 @@ struct get_page_nesting_size { size_type const max_depth; size_t const num_pages; PageInfo const* const pages; - int const* page_indices; __device__ size_type operator()(size_t index) const { auto const indices = reduction_indices{index, max_depth, num_pages}; - auto const& page = pages[page_indices[indices.page_idx]]; + auto const& page = pages[indices.page_idx]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { @@ -995,12 +967,14 @@ struct chunk_row_output_iter { __device__ reference operator*() { return p->chunk_row; } }; +/** + * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. + */ /** * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
*/ struct start_offset_output_iterator { PageInfo const* pages; - int const* page_indices; size_t cur_index; input_col_info const* input_cols; size_type max_depth; @@ -1014,17 +988,16 @@ struct start_offset_output_iterator { constexpr void operator=(start_offset_output_iterator const& other) { - pages = other.pages; - page_indices = other.page_indices; - cur_index = other.cur_index; - input_cols = other.input_cols; - max_depth = other.max_depth; - num_pages = other.num_pages; + pages = other.pages; + cur_index = other.cur_index; + input_cols = other.input_cols; + max_depth = other.max_depth; + num_pages = other.num_pages; } constexpr start_offset_output_iterator operator+(size_t i) { - return {pages, page_indices, cur_index + i, input_cols, max_depth, num_pages}; + return start_offset_output_iterator{pages, cur_index + i, input_cols, max_depth, num_pages}; } constexpr start_offset_output_iterator& operator++() @@ -1041,7 +1014,7 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[indices.page_idx]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { @@ -1051,114 +1024,20 @@ struct start_offset_output_iterator { } }; -struct flat_column_num_rows { - PageInfo const* pages; - ColumnChunkDesc const* chunks; - - __device__ size_type operator()(size_type pindex) const - { - PageInfo const& page = pages[pindex]; - // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { - return 0; - } - return page.num_rows; - } -}; - -struct row_counts_nonzero { - __device__ bool operator()(size_type count) const { return count > 0; } -}; - -struct row_counts_different { - size_type const expected; - __device__ bool operator()(size_type count) const { return (count != 0) && (count != expected); } -}; - -/** - * @brief Detect malformed parquet input data. - * - * We have seen cases where parquet files can be oddly malformed. This function specifically - * detects one case in particular: - * - * - When you have a file containing N rows - * - For some reason, the sum total of the number of rows over all pages for a given column - * is != N - * - * @param pages All pages to be decoded - * @param chunks Chunk data - * @param page_keys Keys (schema id) associated with each page, sorted by column - * @param page_index Page indices for iteration, sorted by column - * @param expected_row_count Expected row count, if applicable - * @param stream CUDA stream used for device memory operations and kernel launches - */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, - device_span page_keys, - device_span page_index, - std::optional expected_row_count, - rmm::cuda_stream_view stream) -{ - // sum row counts for all non-dictionary, non-list columns. 
other columns will be indicated as 0 - rmm::device_uvector row_counts(pages.size(), - stream); // worst case: num keys == num pages - auto const size_iter = thrust::make_transform_iterator( - page_index.begin(), flat_column_num_rows{pages.device_ptr(), chunks.device_ptr()}); - auto const row_counts_begin = row_counts.begin(); - auto const row_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - size_iter, - thrust::make_discard_iterator(), - row_counts_begin) - .second; - - // make sure all non-zero row counts are the same - rmm::device_uvector compacted_row_counts(pages.size(), stream); - auto const compacted_row_counts_begin = compacted_row_counts.begin(); - auto const compacted_row_counts_end = thrust::copy_if(rmm::exec_policy(stream), - row_counts_begin, - row_counts_end, - compacted_row_counts_begin, - row_counts_nonzero{}); - if (compacted_row_counts_end != compacted_row_counts_begin) { - size_t const found_row_count = static_cast(compacted_row_counts.element(0, stream)); - - // if we somehow don't match the expected row count from the row groups themselves - if (expected_row_count.has_value()) { - CUDF_EXPECTS(expected_row_count.value() == found_row_count, - "Encountered malformed parquet page data (unexpected row count in page data)"); - } - - // all non-zero row counts must be the same - auto const chk = - thrust::count_if(rmm::exec_policy(stream), - compacted_row_counts_begin, - compacted_row_counts_end, - row_counts_different{static_cast(found_row_count)}); - CUDF_EXPECTS(chk == 0, - "Encountered malformed parquet page data (row count mismatch in page data)"); - } -} - struct page_to_string_size { - PageInfo* pages; ColumnChunkDesc const* chunks; - __device__ size_t operator()(size_type page_idx) const + __device__ size_t operator()(PageInfo const& page) const { - auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } - return pages[page_idx].str_bytes; + return page.str_bytes; } }; struct page_offset_output_iter { PageInfo* p; - size_type const* index; using value_type = size_type; using difference_type = size_type; @@ -1166,78 +1045,148 @@ struct page_offset_output_iter { using reference = size_type&; using iterator_category = thrust::output_device_iterator_tag; - __host__ __device__ page_offset_output_iter operator+(int i) { return {p, index + i}; } + __host__ __device__ page_offset_output_iter operator+(int i) { return {p + i}; } __host__ __device__ page_offset_output_iter& operator++() { - index++; + p++; return *this; } - __device__ reference operator[](int i) { return p[index[i]].str_offset; } - __device__ reference operator*() { return p[*index].str_offset; } + __device__ reference operator[](int i) { return p[i].str_offset; } + __device__ reference operator*() { return p->str_offset; } }; +// update chunk_row field in subpass page from pass page +struct update_subpass_chunk_row { + device_span pass_pages; + device_span subpass_pages; + device_span page_src_index; -} // anonymous namespace + __device__ void operator()(size_t i) + { + subpass_pages[i].chunk_row = pass_pages[page_src_index[i]].chunk_row; + } +}; -void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit) -{ - auto const skip_rows = _pass_itm_data->skip_rows; - auto const num_rows = _pass_itm_data->num_rows; - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; +// update num_rows 
field from pass page to subpass page +struct update_pass_num_rows { + device_span pass_pages; + device_span subpass_pages; + device_span page_src_index; - // compute page ordering. - // - // ordering of pages is by input column schema, repeated across row groups. so - // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like - // - // 1, 1, 2, 2, 3, 3 - // - // However, if we had more than one row group, the pattern would be - // - // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3 - // ^ row group 0 | - // ^ row group 1 - // - // To process pages by key (exclusive_scan_by_key, reduce_by_key, etc), the ordering we actually - // want is - // - // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 - // - // We also need to preserve key-relative page ordering, so we need to use a stable sort. - rmm::device_uvector page_keys(pages.size(), _stream); - rmm::device_uvector page_index(pages.size(), _stream); + __device__ void operator()(size_t i) { - thrust::transform(rmm::exec_policy(_stream), - pages.device_ptr(), - pages.device_ptr() + pages.size(), - page_keys.begin(), - get_page_column_index{chunks.device_ptr()}); + pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows; + } +}; - thrust::sequence(rmm::exec_policy(_stream), page_index.begin(), page_index.end()); - thrust::stable_sort_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), - page_index.begin(), - thrust::less()); +} // anonymous namespace + +void reader::impl::preprocess_file( + int64_t skip_rows, + std::optional const& num_rows, + host_span const> row_group_indices, + std::optional> filter) +{ + CUDF_EXPECTS(!_file_preprocessed, "Attempted to preprocess file more than once"); + + // if filter is not empty, then create output types as vector and pass for filtering. + std::vector output_types; + if (filter.has_value()) { + std::transform(_output_buffers.cbegin(), + _output_buffers.cend(), + std::back_inserter(output_types), + [](auto const& col) { return col.type; }); + } + std::tie( + _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = + _metadata->select_row_groups( + row_group_indices, skip_rows, num_rows, output_types, filter, _stream); + + if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && + not _input_columns.empty()) { + // fills in chunk information without physically loading or decompressing + // the associated data + create_global_chunk_info(); + + // compute schedule of input reads. 
+ compute_input_passes(); + } + +#if defined(PARQUET_CHUNK_LOGGING) + printf("==============================================\n"); + setlocale(LC_NUMERIC, ""); + printf("File: skip_rows(%'lu), num_rows(%'lu), input_read_limit(%'lu), output_read_limit(%'lu)\n", + _file_itm_data.global_skip_rows, + _file_itm_data.global_num_rows, + _input_pass_read_limit, + _output_chunk_read_limit); + printf("# Row groups: %'lu\n", _file_itm_data.row_groups.size()); + printf("# Input passes: %'lu\n", _file_itm_data.num_passes()); + printf("# Input columns: %'lu\n", _input_columns.size()); + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const& schema = _metadata->get_schema(_input_columns[idx].schema_idx); + auto const type_id = to_type_id(schema, _strings_to_categorical, _timestamp_type.id()); + printf("\tC(%'lu, %s): %s\n", + idx, + _input_columns[idx].name.c_str(), + cudf::type_to_name(cudf::data_type{type_id}).c_str()); + } + printf("# Output columns: %'lu\n", _output_buffers.size()); + for (size_t idx = 0; idx < _output_buffers.size(); idx++) { + printf("\tC(%'lu): %s\n", idx, cudf::io::detail::type_to_name(_output_buffers[idx]).c_str()); } +#endif + + _file_preprocessed = true; +} + +void reader::impl::generate_list_column_row_count_estimates() +{ + auto& pass = *_pass_itm_data; + thrust::for_each(rmm::exec_policy(_stream), + pass.pages.d_begin(), + pass.pages.d_end(), + set_list_row_count_estimate{pass.chunks}); + + // computes: + // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. The start_row + // field in ColumnChunkDesc is the absolute row index for the whole file. chunk_row in PageInfo is + // relative to the beginning of the chunk. so in the kernels, chunk.start_row + page.chunk_row + // gives us the absolute row index + auto key_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{}); + auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{}); + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + key_input, + key_input + pass.pages.size(), + page_input, + chunk_row_output_iter{pass.pages.device_ptr()}); + + // finally, fudge the last page for each column such that it ends on the real known row count + // for the pass. this is so that as we march through the subpasses, we will find that every column + // cleanly ends up the expected row count at the row group boundary. + auto const& last_chunk = pass.chunks[pass.chunks.size() - 1]; + auto const num_columns = _input_columns.size(); + size_t const max_row = last_chunk.start_row + last_chunk.num_rows; + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy_nosync(_stream), + iter, + iter + num_columns, + set_final_row_count{pass.pages, pass.chunks, pass.page_offsets, max_row}); + + pass.chunks.device_to_host_async(_stream); + pass.pages.device_to_host_async(_stream); + _stream.synchronize(); +} + +void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit) +{ + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; - // detect malformed columns. - // - we have seen some cases in the wild where we have a row group containing N - // rows, but the total number of rows in the pages for column X is != N. while it - // is possible to load this by just capping the number of rows read, we cannot tell - // which rows are invalid so we may be returning bad data. 
in addition, this mismatch - // confuses the chunked reader - detect_malformed_pages(pages, - chunks, - page_keys, - page_index, - uses_custom_row_bounds ? std::nullopt : std::make_optional(num_rows), - _stream); - - // iterate over all input columns and determine if they contain lists so we can further - // preprocess them. + // iterate over all input columns and determine if they contain lists. + // TODO: we could do this once at the file level instead of every time we get in here. the set of + // columns we are processing does not change over multiple passes/subpasses/output chunks. bool has_lists = false; for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const& input_col = _input_columns[idx]; @@ -1258,49 +1207,9 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (has_lists) { break; } } - // generate string dict indices if necessary - { - auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { - return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; - }; - - // Count the number of string dictionary entries - // NOTE: Assumes first page in the chunk is always the dictionary page - size_t total_str_dict_indexes = 0; - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - if (is_dict_chunk(chunks[c])) { - total_str_dict_indexes += pages[page_count].num_input_values; - } - page_count += chunks[c].max_num_pages; - } - - // Build index for string dictionaries since they can't be indexed - // directly due to variable-sized elements - _pass_itm_data->str_dict_index = - cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); - - // Update chunks with pointers to string dict indices - for (size_t c = 0, page_count = 0, str_ofs = 0; c < chunks.size(); c++) { - input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, - "Column/page schema index mismatch"); - if (is_dict_chunk(chunks[c])) { - chunks[c].str_dict_index = _pass_itm_data->str_dict_index.data() + str_ofs; - str_ofs += pages[page_count].num_input_values; - } - - // column_data_base will always point to leaf data, even for nested types. - page_count += chunks[c].max_num_pages; - } - - if (total_str_dict_indexes > 0) { - chunks.host_to_device_async(_stream); - BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); - } - } - - // intermediate data we will need for further chunked reads + // in some cases we will need to do further preprocessing of pages. + // - if we have lists, the num_rows field in PageInfo will be incorrect coming out of the file + // - if we are doing a chunked read, we need to compute the size of all string data if (has_lists || chunk_read_limit > 0) { // computes: // PageNestingInfo::num_rows for each page. the true number of rows (taking repetition into @@ -1311,48 +1220,92 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - ComputePageSizes(pages, - chunks, + ComputePageSizes(subpass.pages, + pass.chunks, 0, // 0-max size_t. process all possible rows std::numeric_limits::max(), true, // compute num_rows chunk_read_limit > 0, // compute string sizes _pass_itm_data->level_type_size, _stream); + } - // computes: - // PageInfo::chunk_row (the absolute start row index) for all pages - // Note: this is doing some redundant work for pages in flat hierarchies. 
chunk_row has already - // been computed during header decoding. the overall amount of work here is very small though. - auto key_input = thrust::make_transform_iterator(pages.device_ptr(), get_page_chunk_idx{}); - auto page_input = thrust::make_transform_iterator(pages.device_ptr(), get_page_num_rows{}); - thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), - key_input, - key_input + pages.size(), - page_input, - chunk_row_output_iter{pages.device_ptr()}); - - // retrieve pages back - pages.device_to_host_sync(_stream); + // copy our now-correct row counts back to the base pages stored in the pass. + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy_nosync(_stream), + iter, + iter + subpass.pages.size(), + update_pass_num_rows{pass.pages, subpass.pages, subpass.page_src_index}); - // print_pages(pages, _stream); - } + // computes: + // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. The start_row + // field in ColumnChunkDesc is the absolute row index for the whole file. chunk_row in PageInfo is + // relative to the beginning of the chunk. so in the kernels, chunk.start_row + page.chunk_row + // gives us the absolute row index + auto key_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{}); + auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{}); + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + key_input, + key_input + pass.pages.size(), + page_input, + chunk_row_output_iter{pass.pages.device_ptr()}); + + // copy chunk row into the subpass pages + thrust::for_each(rmm::exec_policy_nosync(_stream), + iter, + iter + subpass.pages.size(), + update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index}); + + // retrieve pages back + pass.pages.device_to_host_async(_stream); + subpass.pages.device_to_host_async(_stream); + _stream.synchronize(); - // preserve page ordering data for string decoder - _pass_itm_data->page_keys = std::move(page_keys); - _pass_itm_data->page_index = std::move(page_index); + // at this point we have an accurate row count so we can compute how many rows we will actually be + // able to decode for this pass. we will have selected a set of pages for each column in the + // row group, but not every page will have the same number of rows. so, we can only read as many + // rows as the smallest batch (by column) we have decompressed. + size_t page_index = 0; + size_t max_row = std::numeric_limits::max(); + auto const last_pass_row = + _file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1]; + for (size_t idx = 0; idx < subpass.column_page_count.size(); idx++) { + auto const& last_page = subpass.pages[page_index + (subpass.column_page_count[idx] - 1)]; + auto const& chunk = pass.chunks[last_page.chunk_idx]; + + size_t max_col_row = + static_cast(chunk.start_row + last_page.chunk_row + last_page.num_rows); + // special case. list rows can span page boundaries, but we can't tell if that is happening + // here because we have not yet decoded the pages. the very last row starting in the page may + // not terminate in the page. to handle this, only decode up to the second to last row in the + // subpass since we know that will safely completed. 
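// The chunk_row computation above is a segmented exclusive scan; the same idea on toy data
// (illustrative only): scanning per-page row counts keyed by chunk yields each page's
// starting row relative to its chunk, so chunk.start_row + chunk_row is the absolute row.
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <vector>

int main()
{
  std::vector<int> h_chunk_ids{0, 0, 0, 1, 1};    // owning chunk of each page
  std::vector<int> h_num_rows{10, 20, 5, 8, 8};   // rows per page
  thrust::device_vector<int> chunk_ids(h_chunk_ids.begin(), h_chunk_ids.end());
  thrust::device_vector<int> num_rows(h_num_rows.begin(), h_num_rows.end());
  thrust::device_vector<int> chunk_row(num_rows.size());

  thrust::exclusive_scan_by_key(
    chunk_ids.begin(), chunk_ids.end(), num_rows.begin(), chunk_row.begin());
  // chunk_row -> 0, 10, 30, 0, 8
  return 0;
}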
+ bool const is_list = chunk.max_level[level_type::REPETITION] > 0; + if (is_list && max_col_row < last_pass_row) { + size_t const min_col_row = static_cast(chunk.start_row + last_page.chunk_row); + CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short subpass"); + max_col_row--; + } + + max_row = min(max_row, max_col_row); + + page_index += subpass.column_page_count[idx]; + } + subpass.skip_rows = pass.skip_rows + pass.processed_rows; + auto const pass_end = pass.skip_rows + pass.num_rows; + max_row = min(max_row, pass_end); + subpass.num_rows = max_row - subpass.skip_rows; - // compute splits for the pass - compute_splits_for_pass(); + // now split up the output into chunks as necessary + compute_output_chunks_for_subpass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) { - auto const& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; // Should not reach here if there is no page data. - CUDF_EXPECTS(pages.size() > 0, "There is no page to parse"); + CUDF_EXPECTS(subpass.pages.size() > 0, "There are no pages present in the subpass"); // computes: // PageNestingInfo::batch_size for each level of nesting, for each page, taking row bounds into @@ -1360,13 +1313,13 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). if (uses_custom_row_bounds) { - ComputePageSizes(pages, - chunks, + ComputePageSizes(subpass.pages, + pass.chunks, skip_rows, num_rows, false, // num_rows is already computed false, // no need to compute string sizes - _pass_itm_data->level_type_size, + pass.level_type_size, _stream); // print_pages(pages, _stream); @@ -1403,8 +1356,6 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - auto& page_index = _pass_itm_data->page_index; - std::vector h_cols_info; h_cols_info.reserve(_input_columns.size()); std::transform(_input_columns.cbegin(), @@ -1423,7 +1374,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses auto const d_cols_info = cudf::detail::make_device_uvector_async( h_cols_info, _stream, rmm::mr::get_current_device_resource()); - auto const num_keys = _input_columns.size() * max_depth * pages.size(); + auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); // size iterator. 
indexes pages by sorted order rmm::device_uvector size_input{num_keys, _stream}; thrust::transform( @@ -1432,9 +1383,9 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses thrust::make_counting_iterator(num_keys), size_input.begin(), get_page_nesting_size{ - d_cols_info.data(), max_depth, pages.size(), pages.device_ptr(), page_index.begin()}); + d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.d_begin()}); auto const reduction_keys = - cudf::detail::make_counting_transform_iterator(0, get_reduction_key{pages.size()}); + cudf::detail::make_counting_transform_iterator(0, get_reduction_key{subpass.pages.size()}); cudf::detail::hostdevice_vector sizes{_input_columns.size() * max_depth, _stream}; // find the size of each column @@ -1452,7 +1403,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses reduction_keys + num_keys, size_input.cbegin(), start_offset_output_iterator{ - pages.device_ptr(), page_index.begin(), 0, d_cols_info.data(), max_depth, pages.size()}); + subpass.pages.d_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()}); sizes.device_to_host_sync(_stream); for (size_type idx = 0; idx < static_cast(_input_columns.size()); idx++) { @@ -1483,30 +1434,30 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses std::vector reader::impl::calculate_page_string_offsets() { - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto const& page_keys = _pass_itm_data->page_keys; - auto const& page_index = _pass_itm_data->page_index; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + auto page_keys = make_page_key_iterator(subpass.pages); std::vector col_sizes(_input_columns.size(), 0L); rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); // use page_index to fetch page string sizes in the proper order - auto val_iter = thrust::make_transform_iterator( - page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); + auto val_iter = thrust::make_transform_iterator(subpass.pages.d_begin(), + page_to_string_size{pass.chunks.d_begin()}); // do scan by key to calculate string offsets for each page thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), - page_keys.begin(), - page_keys.end(), + page_keys, + page_keys + subpass.pages.size(), val_iter, - page_offset_output_iter{pages.device_ptr(), page_index.data()}); + page_offset_output_iter{subpass.pages.device_ptr()}); // now sum up page sizes rmm::device_uvector reduce_keys(col_sizes.size(), _stream); thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), - page_keys.begin(), - page_keys.end(), + page_keys, + page_keys + subpass.pages.size(), val_iter, reduce_keys.begin(), d_col_sizes.begin()); diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 36303a60aa9..951217dc442 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -26,6 +26,9 @@ #include +#include +#include + namespace cudf::io::detail { void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) @@ -129,6 +132,30 @@ string_policy column_buffer_base::empty_like(string_policy const& return new_buff; } +template +std::string type_to_name(column_buffer_base const& buffer) +{ + if (buffer.type.id() == cudf::type_id::LIST) { + return "List<" + (type_to_name(buffer.children[0])) + ">"; + } + + if (buffer.type.id() == cudf::type_id::STRUCT) { + std::ostringstream out; + + out 
<< "Struct<"; + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + buffer.children.size(), + std::ostream_iterator(out, ","), + [&buffer](size_type i) { return type_to_name(buffer.children[i]); }); + out << ">"; + return out.str(); + } + + return cudf::type_to_name(buffer.type); +} + template std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, @@ -336,6 +363,10 @@ template std::unique_ptr empty_like(pointer_column_buffer& rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +template std::string type_to_name(string_column_buffer const& buffer); +template std::string type_to_name(pointer_column_buffer const& buffer); + template class column_buffer_base; template class column_buffer_base; + } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 2ee7c17e480..57ee1043ee9 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -253,6 +253,16 @@ std::unique_ptr empty_like(column_buffer_base& buffer, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Given a column_buffer, produce a formatted name string describing the type. + * + * @param buffer The column buffer + * + * @return A string describing the type of the buffer suitable for printing + */ +template +std::string type_to_name(column_buffer_base const& buffer); + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 24085eb5e10..d40b2410ca3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -293,7 +293,7 @@ ConfigureTest( ConfigureTest( PARQUET_TEST io/parquet_test.cpp - io/parquet_chunked_reader_test.cpp + io/parquet_chunked_reader_test.cu io/parquet_chunked_writer_test.cpp io/parquet_common.cpp io/parquet_misc_test.cpp diff --git a/cpp/tests/io/parquet_chunked_reader_test.cpp b/cpp/tests/io/parquet_chunked_reader_test.cu similarity index 73% rename from cpp/tests/io/parquet_chunked_reader_test.cpp rename to cpp/tests/io/parquet_chunked_reader_test.cu index 05fb9a3ec48..dea44f0e7c3 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cpp +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "parquet_common.hpp" + #include #include #include @@ -44,14 +46,12 @@ #include #include +#include #include #include namespace { -// Global environment for temporary files -auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); using int32s_col = cudf::test::fixed_width_column_wrapper; using int64s_col = cudf::test::fixed_width_column_wrapper; @@ -953,64 +953,296 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount) } while (reader.has_next()); } -TEST_F(ParquetChunkedReaderTest, InputLimitSimple) +constexpr size_t input_limit_expected_file_count = 4; + +std::vector input_limit_get_test_names(std::string const& base_filename) { - auto const filepath = temp_env->get_temp_filepath("input_limit_10_rowgroups.parquet"); - - // This results in 10 grow groups, at 4001150 bytes per row group - constexpr int num_rows = 25'000'000; - auto value_iter = cudf::detail::make_counting_transform_iterator(0, [](int i) { return i; }); - cudf::test::fixed_width_column_wrapper expected(value_iter, value_iter + num_rows); - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, - cudf::table_view{{expected}}) - // note: it is unnecessary to force compression to NONE here because the size we are using in - // the row group is the uncompressed data size. But forcing the dictionary policy to - // dictionary_policy::NEVER is necessary to prevent changes in the - // decompressed-but-not-yet-decoded data. - .dictionary_policy(cudf::io::dictionary_policy::NEVER); - - cudf::io::write_parquet(opts); - - { - // no chunking - auto const [result, num_chunks] = chunked_read(filepath, 0, 0); - EXPECT_EQ(num_chunks, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } + return {base_filename + "_a.parquet", + base_filename + "_b.parquet", + base_filename + "_c.parquet", + base_filename + "_d.parquet"}; +} - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 1); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } +void input_limit_test_write_one(std::string const& filepath, + cudf::table_view const& t, + cudf::io::compression_type compression, + cudf::io::dictionary_policy dict_policy) +{ + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) + .compression(compression) + .dictionary_policy(dict_policy); + cudf::io::write_parquet(out_opts); +} - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 4000000); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } +void input_limit_test_write(std::vector const& test_filenames, + cudf::table_view const& t) +{ + CUDF_EXPECTS(test_filenames.size() == 4, "Unexpected count of test filenames"); + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + "Unexpected count of test filenames"); + + // no compression + input_limit_test_write_one( + test_filenames[0], t, cudf::io::compression_type::NONE, cudf::io::dictionary_policy::NEVER); + // compression with a codec that uses a lot of scratch space at decode time (2.5x the total + // decompressed buffer size) + input_limit_test_write_one( + test_filenames[1], t, cudf::io::compression_type::ZSTD, cudf::io::dictionary_policy::NEVER); + // compression with a codec that uses no scratch space at decode time + 
input_limit_test_write_one( + test_filenames[2], t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::NEVER); + input_limit_test_write_one( + test_filenames[3], t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::ALWAYS); +} - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 4100000); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } +void input_limit_test_read(std::vector const& test_filenames, + cudf::table_view const& t, + size_t output_limit, + size_t input_limit, + int const expected_chunk_counts[input_limit_expected_file_count]) +{ + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + "Unexpected count of test filenames"); - { - // 12 chunks of 200k rows each, plus 1 final chunk of 100k rows. - auto const [result, num_chunks] = chunked_read(filepath, 0, 8002301); - EXPECT_EQ(num_chunks, 13); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + for (size_t idx = 0; idx < test_filenames.size(); idx++) { + auto result = chunked_read(test_filenames[idx], output_limit, input_limit); + CUDF_EXPECTS(result.second == expected_chunk_counts[idx], + "Unexpected number of chunks produced in chunk read"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); } +} + +struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {}; + +TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn) +{ + auto base_path = temp_env->get_temp_filepath("single_col_fixed_width"); + auto test_filenames = input_limit_get_test_names(base_path); + + constexpr auto num_rows = 1'000'000; + auto iter1 = thrust::make_constant_iterator(15); + cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows); + auto tbl = cudf::table_view{{col1}}; + + input_limit_test_write(test_filenames, tbl); + + // semi-reasonable limit + constexpr int expected_a[] = {1, 17, 4, 1}; + input_limit_test_read(test_filenames, tbl, 0, 2 * 1024 * 1024, expected_a); + // an unreasonable limit + constexpr int expected_b[] = {1, 50, 50, 1}; + input_limit_test_read(test_filenames, tbl, 0, 1, expected_b); +} + +TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns) +{ + auto base_path = temp_env->get_temp_filepath("mixed_columns"); + auto test_filenames = input_limit_get_test_names(base_path); + + constexpr auto num_rows = 1'000'000; + + auto iter1 = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows); + + auto iter2 = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper col2(iter2, iter2 + num_rows); + + auto const strings = std::vector{"abc", "de", "fghi"}; + auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { + if (i < 250000) { return strings[0]; } + if (i < 750000) { return strings[1]; } + return strings[2]; + }); + auto col3 = strings_col(str_iter, str_iter + num_rows); + + auto tbl = cudf::table_view{{col1, col2, col3}}; + + input_limit_test_write(test_filenames, tbl); + constexpr int expected_a[] = {1, 50, 10, 7}; + input_limit_test_read(test_filenames, tbl, 0, 2 * 1024 * 1024, expected_a); + constexpr int expected_b[] = {1, 50, 50, 50}; + input_limit_test_read(test_filenames, tbl, 0, 1, expected_b); +} + +struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {}; + +struct offset_gen { + int const group_size; + __device__ int operator()(int i) { return i * group_size; } +}; + 
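// For reference, a host-side copy of the generator above (the real one is __device__-only):
// offset_gen{4} yields 0, 4, 8, 12, ... i.e. the offsets of fixed-size-4 lists, which is how
// the tests below build large list columns cheaply. Illustrative only.
#include <cstdio>

struct offset_gen_host {
  int const group_size;
  int operator()(int i) const { return i * group_size; }
};

int main()
{
  offset_gen_host gen{4};
  for (int i = 0; i < 5; ++i) { std::printf("%d ", gen(i)); }  // prints: 0 4 8 12 16
  return 0;
}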
+template <typename T>
+struct value_gen {
+  __device__ T operator()(int i) { return i % 1024; }
+};
+TEST_F(ParquetChunkedReaderInputLimitTest, List)
+{
+  auto base_path      = temp_env->get_temp_filepath("list");
+  auto test_filenames = input_limit_get_test_names(base_path);
+
+  constexpr int num_rows  = 50'000'000;
+  constexpr int list_size = 4;
+
+  auto const stream = cudf::get_default_stream();
+
+  auto offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{list_size});
+  auto offset_col  = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               offset_iter,
+               offset_iter + num_rows + 1,
+               offset_col->mutable_view().begin<int>());
+
+  // list
+  constexpr int num_ints = num_rows * list_size;
+  auto value_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
+  auto value_col  = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               value_iter,
+               value_iter + num_ints,
+               value_col->mutable_view().begin<int>());
+  auto col1 =
+    cudf::make_lists_column(num_rows,
+                            std::move(offset_col),
+                            std::move(value_col),
+                            0,
+                            cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
+                            stream);
+
+  auto tbl = cudf::table_view{{*col1}};
+
+  input_limit_test_write(test_filenames, tbl);
+
+  // even though we have a very large limit here, there are two cases where we actually produce
+  // splits.
+  // - uncompressed data (with no dict). This happens because the code has to make a guess at how
+  //   much space to reserve for compressed/uncompressed data prior to reading. It does not know
+  //   that everything it will be reading in this case is uncompressed already, so this guess ends
+  //   up causing it to generate two top level passes. In practice, this shouldn't matter because
+  //   we never really see uncompressed data in the wild.
+  //
+  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary
+  //   space: 2.5x the total size of the decompressed data. So 2 GB is actually not enough to hold
+  //   the whole thing at once.
+  //
+  // Note that in the dictionary cases, both of these revert down to 1 chunk because the
+  // dictionaries dramatically shrink the size of the uncompressed data.
+  constexpr int expected_a[] = {2, 2, 1, 1};
+  input_limit_test_read(test_filenames, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a);
+  // smaller limit
+  constexpr int expected_b[] = {6, 6, 2, 1};
+  input_limit_test_read(test_filenames, tbl, 0, 512 * 1024 * 1024, expected_b);
+  // include output chunking as well
+  constexpr int expected_c[] = {11, 11, 9, 8};
+  input_limit_test_read(test_filenames, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c);
+}
+
+struct char_values {
+  __device__ int8_t operator()(int i)
   {
-    // 1 big chunk
-    auto const [result, num_chunks] = chunked_read(filepath, 0, size_t{1} * 1024 * 1024 * 1024);
-    EXPECT_EQ(num_chunks, 1);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0));
+    int const index = (i / 2) % 3;
+    // generate repeating 3-runs of 2 values each. aabbcc
+    return index == 0 ? 'a' : (index == 1 ? 'b' : 'c');
   }
+};
+TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
+{
+  auto base_path      = temp_env->get_temp_filepath("mixed_types");
+  auto test_filenames = input_limit_get_test_names(base_path);
+
+  constexpr int num_rows  = 50'000'000;
+  constexpr int list_size = 4;
+  constexpr int str_size  = 3;
+
+  auto const stream = cudf::get_default_stream();
+
+  auto offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{list_size});
+  auto offset_col  = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               offset_iter,
+               offset_iter + num_rows + 1,
+               offset_col->mutable_view().begin<int>());
+
+  // list
+  constexpr int num_ints = num_rows * list_size;
+  auto value_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
+  auto value_col  = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               value_iter,
+               value_iter + num_ints,
+               value_col->mutable_view().begin<int>());
+  auto col1 =
+    cudf::make_lists_column(num_rows,
+                            std::move(offset_col),
+                            std::move(value_col),
+                            0,
+                            cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
+                            stream);
+
+  // strings
+  constexpr int num_chars = num_rows * str_size;
+  auto str_offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{str_size});
+  auto str_offset_col  = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               str_offset_iter,
+               str_offset_iter + num_rows + 1,
+               str_offset_col->mutable_view().begin<int>());
+  auto str_iter = cudf::detail::make_counting_transform_iterator(0, char_values{});
+  rmm::device_buffer str_chars(num_chars, stream);
+  thrust::copy(rmm::exec_policy(stream),
+               str_iter,
+               str_iter + num_chars,
+               static_cast<int8_t*>(str_chars.data()));
+  auto col2 =
+    cudf::make_strings_column(num_rows,
+                              std::move(str_offset_col),
+                              std::move(str_chars),
+                              0,
+                              cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED));
+
+  // doubles
+  auto double_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<double>{});
+  auto col3 = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               double_iter,
+               double_iter + num_rows,
+               col3->mutable_view().begin<double>());
+
+  auto tbl = cudf::table_view{{*col1, *col2, *col3}};
+
+  input_limit_test_write(test_filenames, tbl);
+
+  // even though we have a very large limit here, there are two cases where we actually produce
+  // splits.
+  // - uncompressed data (with no dict). This happens because the code has to make a guess at how
+  //   much space to reserve for compressed/uncompressed data prior to reading. It does not know
+  //   that everything it will be reading in this case is uncompressed already, so this guess ends
+  //   up causing it to generate two top level passes. In practice, this shouldn't matter because
+  //   we never really see uncompressed data in the wild.
+  //
+  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary
+  //   space: 2.5x the total size of the decompressed data. So 2 GB is actually not enough to hold
+  //   the whole thing at once.
+ // + // Note that in the dictionary cases, both of these revert down to 1 chunk because the + // dictionaries dramatically shrink the size of the uncompressed data. + constexpr int expected_a[] = {3, 3, 1, 1}; + input_limit_test_read(test_filenames, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a); + // smaller limit + constexpr int expected_b[] = {10, 11, 4, 1}; + input_limit_test_read(test_filenames, tbl, 0, 512 * 1024 * 1024, expected_b); + // include output chunking as well + constexpr int expected_c[] = {20, 21, 15, 14}; + input_limit_test_read(test_filenames, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c); } From 59199da40881cb392a0496ba89f865a5a0b0bdb1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 24 Jan 2024 20:20:17 -0600 Subject: [PATCH 52/60] Update pre-commit hooks (#14837) This PR updates pre-commit hook versions and reformats the YAML so its spacing is more similar to the YAML format elsewhere in cudf and in other RAPIDS repos. Feel free to review this as two separate commits: a content change, and a format change. The changes outside of `.pre-commit-config.yaml` are from minor updates in `black`, which removed some extraneous blank lines, and a few fixes requested by `ruff`. Also, the newer version of nbqa in this PR supports Python 3.12. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/14837 --- .pre-commit-config.yaml | 308 +++++++++--------- python/cudf/benchmarks/conftest.py | 3 +- python/cudf/benchmarks/internal/conftest.py | 3 +- python/cudf/cudf/_fuzz_testing/fuzzer.py | 4 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 1 - python/cudf/cudf/core/dataframe.py | 3 +- python/cudf/cudf/core/df_protocol.py | 1 - python/cudf/cudf/core/resample.py | 8 +- python/cudf/cudf/core/scalar.py | 3 +- python/cudf/cudf/core/series.py | 1 - python/cudf/cudf/core/subword_tokenizer.py | 3 +- python/cudf/cudf/core/udf/strings_typing.py | 4 +- python/cudf/cudf/io/parquet.py | 4 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 3 +- python/cudf/cudf/testing/testing.py | 3 +- .../test_avro_reader_fastavro_integration.py | 8 +- python/cudf/cudf/tests/test_binops.py | 9 +- python/cudf/cudf/tests/test_categorical.py | 5 - python/cudf/cudf/tests/test_column.py | 2 - python/cudf/cudf/tests/test_concat.py | 3 +- python/cudf/cudf/tests/test_cut.py | 8 +- python/cudf/cudf/tests/test_dataframe.py | 1 - python/cudf/cudf/tests/test_datetime.py | 6 - python/cudf/cudf/tests/test_dropna.py | 3 +- python/cudf/cudf/tests/test_duplicates.py | 3 +- .../cudf/tests/test_extension_compilation.py | 5 +- python/cudf/cudf/tests/test_factorize.py | 3 +- python/cudf/cudf/tests/test_groupby.py | 1 - python/cudf/cudf/tests/test_hdf.py | 3 +- python/cudf/cudf/tests/test_interval.py | 3 +- python/cudf/cudf/tests/test_joining.py | 9 +- python/cudf/cudf/tests/test_replace.py | 6 +- python/cudf/cudf/tests/test_repr.py | 3 +- python/cudf/cudf/tests/test_reshape.py | 4 +- python/cudf/cudf/tests/test_rolling.py | 5 +- python/cudf/cudf/tests/test_scalar.py | 4 +- python/cudf/cudf/tests/test_search.py | 4 +- python/cudf/cudf/tests/test_series.py | 2 - python/cudf/cudf/tests/test_sorting.py | 5 +- python/cudf/cudf/tests/test_stats.py | 5 +- python/cudf/cudf/tests/test_string.py | 6 +- python/cudf/cudf/tests/test_testing.py | 1 - python/cudf/cudf/tests/test_timedelta.py | 1 - python/cudf/cudf/tests/test_transform.py | 3 +- 
python/cudf/cudf/tests/test_udf_masked_ops.py | 1 - python/cudf/cudf/utils/applyutils.py | 3 +- python/cudf/cudf/utils/hash_vocab_utils.py | 5 +- python/cudf/cudf/utils/ioutils.py | 6 +- .../cudf_pandas_tests/test_cudf_pandas.py | 1 - python/custreamz/custreamz/tests/conftest.py | 3 +- python/dask_cudf/dask_cudf/backends.py | 4 +- python/dask_cudf/dask_cudf/core.py | 2 - python/dask_cudf/dask_cudf/io/parquet.py | 9 +- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 3 +- .../dask_cudf/dask_cudf/io/tests/test_orc.py | 4 +- python/dask_cudf/dask_cudf/io/text.py | 3 +- .../dask_cudf/tests/test_accessor.py | 4 +- python/dask_cudf/dask_cudf/tests/test_core.py | 6 - .../dask_cudf/tests/test_reductions.py | 3 +- 59 files changed, 203 insertions(+), 327 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4fab4ddc6bd..9ac373db309 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,159 +1,159 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 - hooks: - - id: trailing-whitespace - exclude: | - (?x)^( - ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* - ) - - id: end-of-file-fixer - exclude: | - (?x)^( - ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* - ) - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - # Use the config file specific to each subproject so that each - # project can specify its own first/third-party packages. - args: ["--config-root=python/", "--resolve-all-configs"] - files: python/.* - types_or: [python, cython, pyi] - - repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black - files: python/.* - # Explicitly specify the pyproject.toml at the repo root, not per-project. - args: ["--config", "pyproject.toml"] - - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.15.0 - hooks: - - id: cython-lint - - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.3.0' - hooks: - - id: mypy - additional_dependencies: [types-cachetools] - args: ["--config-file=pyproject.toml", - "python/cudf/cudf", - "python/custreamz/custreamz", - "python/cudf_kafka/cudf_kafka", - "python/dask_cudf/dask_cudf"] - pass_filenames: false - - repo: https://github.com/PyCQA/pydocstyle - rev: 6.1.1 - hooks: - - id: pydocstyle - # https://github.com/PyCQA/pydocstyle/issues/603 - additional_dependencies: [toml] - args: ["--config=pyproject.toml"] - exclude: | - (?x)^( - ^python/cudf/cudf/pandas/scripts/.*| - ^python/cudf/cudf_pandas_tests/.* - ) - - repo: https://github.com/nbQA-dev/nbQA - rev: 1.6.3 - hooks: - - id: nbqa-isort - # Use the cudf_kafka isort orderings in notebooks so that dask - # and RAPIDS packages have their own sections. - args: ["--settings-file=python/cudf_kafka/pyproject.toml"] - - id: nbqa-black - # Explicitly specify the pyproject.toml at the repo root, not per-project. 
- args: ["--config=pyproject.toml"] - - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.6 - hooks: - - id: clang-format - types_or: [c, c++, cuda] - args: ["-fallback-style=none", "-style=file", "-i"] - - repo: https://github.com/sirosen/texthooks - rev: 0.4.0 - hooks: - - id: fix-smartquotes - exclude: | - (?x)^( - ^cpp/include/cudf_test/cxxopts.hpp| - ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| - ^python/cudf/cudf/tests/text/test_text_methods.py - ) - - repo: local - hooks: - - id: no-deprecationwarning - name: no-deprecationwarning - description: 'Enforce that DeprecationWarning is not introduced (use FutureWarning instead)' - entry: '(category=|\s)DeprecationWarning[,)]' - language: pygrep - types_or: [python, cython] - - id: no-programmatic-xfail - name: no-programmatic-xfail - description: 'Enforce that pytest.xfail is not introduced (see dev docs for details)' - entry: 'pytest\.xfail' - language: pygrep - types: [python] - - id: cmake-format - name: cmake-format - entry: ./cpp/scripts/run-cmake-format.sh cmake-format - language: python - types: [cmake] - # Note that pre-commit autoupdate does not update the versions - # of dependencies, so we'll have to update this manually. - additional_dependencies: - - cmakelang==0.6.13 - verbose: true - require_serial: true - - id: cmake-lint - name: cmake-lint - entry: ./cpp/scripts/run-cmake-format.sh cmake-lint - language: python - types: [cmake] - # Note that pre-commit autoupdate does not update the versions - # of dependencies, so we'll have to update this manually. - additional_dependencies: - - cmakelang==0.6.13 - verbose: true - require_serial: true - - id: copyright-check - name: copyright-check - entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year - language: python - pass_filenames: false - additional_dependencies: [gitpython] - - id: doxygen-check - name: doxygen-check - entry: ./ci/checks/doxygen.sh - files: ^cpp/include/ - types_or: [file] - language: system - pass_filenames: false - verbose: true - - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 - hooks: - - id: codespell - additional_dependencies: [tomli] - args: ["--toml", "pyproject.toml"] - exclude: | - (?x)^( - .*test.*| - ^CHANGELOG.md$ - ) - - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.8.0 - hooks: - - id: rapids-dependency-file-generator - args: ["--clean"] - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.278 - hooks: - - id: ruff - files: python/.*$ + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + exclude: | + (?x)^( + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* + ) + - id: end-of-file-fixer + exclude: | + (?x)^( + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* + ) + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + # Use the config file specific to each subproject so that each + # project can specify its own first/third-party packages. + args: ["--config-root=python/", "--resolve-all-configs"] + files: python/.* + types_or: [python, cython, pyi] + - repo: https://github.com/psf/black + rev: 23.12.1 + hooks: + - id: black + files: python/.* + # Explicitly specify the pyproject.toml at the repo root, not per-project. 
+ args: ["--config", "pyproject.toml"] + - repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.16.0 + hooks: + - id: cython-lint + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v1.3.0' + hooks: + - id: mypy + additional_dependencies: [types-cachetools] + args: ["--config-file=pyproject.toml", + "python/cudf/cudf", + "python/custreamz/custreamz", + "python/cudf_kafka/cudf_kafka", + "python/dask_cudf/dask_cudf"] + pass_filenames: false + - repo: https://github.com/PyCQA/pydocstyle + rev: 6.3.0 + hooks: + - id: pydocstyle + # https://github.com/PyCQA/pydocstyle/issues/603 + additional_dependencies: [tomli] + args: ["--config=pyproject.toml"] + exclude: | + (?x)^( + ^python/cudf/cudf/pandas/scripts/.*| + ^python/cudf/cudf_pandas_tests/.* + ) + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.7.1 + hooks: + - id: nbqa-isort + # Use the cudf_kafka isort orderings in notebooks so that dask + # and RAPIDS packages have their own sections. + args: ["--settings-file=python/cudf_kafka/pyproject.toml"] + - id: nbqa-black + # Explicitly specify the pyproject.toml at the repo root, not per-project. + args: ["--config=pyproject.toml"] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.6 + hooks: + - id: clang-format + types_or: [c, c++, cuda] + args: ["-fallback-style=none", "-style=file", "-i"] + - repo: https://github.com/sirosen/texthooks + rev: 0.6.3 + hooks: + - id: fix-smartquotes + exclude: | + (?x)^( + ^cpp/include/cudf_test/cxxopts.hpp| + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| + ^python/cudf/cudf/tests/text/test_text_methods.py + ) + - repo: local + hooks: + - id: no-deprecationwarning + name: no-deprecationwarning + description: 'Enforce that DeprecationWarning is not introduced (use FutureWarning instead)' + entry: '(category=|\s)DeprecationWarning[,)]' + language: pygrep + types_or: [python, cython] + - id: no-programmatic-xfail + name: no-programmatic-xfail + description: 'Enforce that pytest.xfail is not introduced (see dev docs for details)' + entry: 'pytest\.xfail' + language: pygrep + types: [python] + - id: cmake-format + name: cmake-format + entry: ./cpp/scripts/run-cmake-format.sh cmake-format + language: python + types: [cmake] + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. + additional_dependencies: + - cmakelang==0.6.13 + verbose: true + require_serial: true + - id: cmake-lint + name: cmake-lint + entry: ./cpp/scripts/run-cmake-format.sh cmake-lint + language: python + types: [cmake] + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. 
+ additional_dependencies: + - cmakelang==0.6.13 + verbose: true + require_serial: true + - id: copyright-check + name: copyright-check + entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year + language: python + pass_filenames: false + additional_dependencies: [gitpython] + - id: doxygen-check + name: doxygen-check + entry: ./ci/checks/doxygen.sh + files: ^cpp/include/ + types_or: [file] + language: system + pass_filenames: false + verbose: true + - repo: https://github.com/codespell-project/codespell + rev: v2.2.2 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: | + (?x)^( + .*test.*| + ^CHANGELOG.md$ + ) + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.8.0 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.13 + hooks: + - id: ruff + files: python/.*$ default_language_version: diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 4f2bb96061f..a70d2329625 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Defines pytest fixtures for all benchmarks. @@ -206,7 +206,6 @@ def default_union_id(val): (r"_rows_\d+", ""), (r"_cols_\d+", ""), ]: - collapse_fixtures(fixtures, pat, repl, globals(), idfunc) num_new_fixtures = len(fixtures) - num_fixtures diff --git a/python/cudf/benchmarks/internal/conftest.py b/python/cudf/benchmarks/internal/conftest.py index 7351f1d1427..a710cf61753 100644 --- a/python/cudf/benchmarks/internal/conftest.py +++ b/python/cudf/benchmarks/internal/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Defines pytest fixtures for internal benchmarks.""" @@ -50,7 +50,6 @@ def column_nulls_true(request, nr=nr): ("_nulls_(true|false)", ""), (r"_rows_\d+", ""), ]: - collapse_fixtures(fixtures, pat, repl, globals()) num_new_fixtures = len(fixtures) - num_fixtures diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index 59d6f198681..ee1b2c1f1c4 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import datetime import json @@ -31,7 +31,6 @@ def __init__( max_lists_length=None, max_lists_nesting_depth=None, ): - self._target = target self._dirs = [] if dirs is None else dirs self._crash_dir = crash_reports_dir @@ -86,7 +85,6 @@ def write_crash(self, error): self._data_handler.write_data(error_file_name) def start(self): - while True: logging.info(f"Running test {self._total_executions}") file_name = self._data_handler.generate_input() diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index aeac4b76e58..b25af13679c 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -226,7 +226,6 @@ def spill(self, target: str = "cpu") -> None: color=_get_color_for_nvtx("SpillHtoD"), domain="cudf_python-spill", ): - dev_mem = rmm.DeviceBuffer.to_device( self._ptr_desc.pop("memoryview") ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7c48352d861..c61fd54db29 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3517,7 +3517,7 @@ def rename( if index: if ( - any(type(item) == str for item in index.values()) + any(isinstance(item, str) for item in index.values()) and type(self.index) != cudf.StringIndex ): raise NotImplementedError( @@ -5274,7 +5274,6 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): ) if isinstance(dataframe, pd.DataFrame): - if not dataframe.columns.is_unique: raise ValueError("Duplicate column names are not allowed") diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index c97d6dcdd2d..62ded8ac6f1 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -792,7 +792,6 @@ def _set_missing_values( cudf_col: cudf.core.column.ColumnBase, allow_copy: bool, ) -> cudf.core.column.ColumnBase: - valid_mask = protocol_col.get_buffers()["validity"] if valid_mask is not None: null, invalid = protocol_col.describe_null diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 0226c778da3..5b0df97de71 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -1,6 +1,6 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: -# Apache-2.0 +# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,7 +33,6 @@ class _Resampler(GroupBy): - grouping: "_ResampleGrouping" def __init__(self, obj, by, axis=None, kind=None): @@ -118,7 +117,6 @@ class SeriesResampler(_Resampler, SeriesGroupBy): class _ResampleGrouping(_Grouping): - bin_labels: cudf.core.index.Index def __init__(self, obj, by=None, level=None): diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index a20628f6601..f7d05e53ce7 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import decimal import operator @@ -114,7 +114,6 @@ class Scalar(BinaryOperand, metaclass=CachedScalarInstanceMeta): _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS def __init__(self, value, dtype=None): - self._host_value = None self._host_dtype = None self._device_value = None diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7e25713e63c..d7249d1a781 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -201,7 +201,6 @@ def __getitem__(self, arg): @_cudf_nvtx_annotate def __setitem__(self, key, value): - if isinstance(key, tuple): key = list(key) diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 821afa2ebe2..24c49e3662a 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -50,7 +50,6 @@ class SubwordTokenizer: """ def __init__(self, hash_file: str, do_lower_case: bool = True): - self.do_lower_case = do_lower_case self.vocab_file = cpp_hashed_vocabulary(hash_file) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 50d34be40a0..43604ab21a7 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import operator @@ -17,7 +17,6 @@ # String object definitions class UDFString(types.Type): - np_dtype = np.dtype("object") def __init__(self): @@ -29,7 +28,6 @@ def return_type(self): class StringView(types.Type): - np_dtype = np.dtype("object") def __init__(self): diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bcc24a85cf9..bac919182c0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -1259,7 +1259,7 @@ def write_table(self, df): """ Write a dataframe to the file/dataset """ - (part_names, grouped_df, part_offsets,) = _get_groups_and_offsets( + part_names, grouped_df, part_offsets = _get_groups_and_offsets( df=df, partition_cols=self.partition_cols, preserve_index=self.common_args["index"], diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 5ea2af7d002..afcfc13a9c4 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import copyreg @@ -1323,6 +1323,7 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): typ, ) + # timestamps and timedeltas are not proxied, but non-proxied # pandas types are currently not picklable. 
Thus, we define # custom reducer/unpicker functions for these types: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 6c2f073b7ac..39fdac0f71a 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -47,7 +47,6 @@ def _check_isinstance(left, right, obj): def raise_assert_detail(obj, message, left, right, diff=None): - msg = f"""{obj} are different {message} diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 2272231fec1..0e38b10ed52 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -125,7 +125,6 @@ def test_can_detect_dtype_from_avro_type_nested( ], ) def test_can_parse_single_value(avro_type, cudf_type, avro_val, cudf_val): - schema_root = { "name": "root", "type": "record", @@ -147,7 +146,6 @@ def test_can_parse_single_value(avro_type, cudf_type, avro_val, cudf_val): @pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) def test_can_parse_single_null(avro_type, cudf_type): - schema_root = { "name": "root", "type": "record", @@ -167,7 +165,6 @@ def test_can_parse_single_null(avro_type, cudf_type): @pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) def test_can_parse_no_data(avro_type, cudf_type): - schema_root = { "name": "root", "type": "record", @@ -188,7 +185,6 @@ def test_can_parse_no_data(avro_type, cudf_type): ) @pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) def test_can_parse_no_fields(avro_type, cudf_type): - schema_root = { "name": "root", "type": "record", @@ -205,7 +201,6 @@ def test_can_parse_no_fields(avro_type, cudf_type): def test_can_parse_no_schema(): - schema_root = None records = [] actual = cudf_from_avro_util(schema_root, records) @@ -307,7 +302,6 @@ def get_days_from_epoch(date: Optional[datetime.date]) -> Optional[int]: @pytest.mark.parametrize("nullable", [True, False]) @pytest.mark.parametrize("prepend_null", [True, False]) def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): - avro_type = {"logicalType": "date", "type": "int"} if nullable: if prepend_null: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index cd3e8f75950..9de7dac652c 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import decimal import operator @@ -605,7 +605,6 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): @pytest.mark.parametrize("binop", _binops) def test_different_shapes_and_columns(binop): - # TODO: support `pow()` on NaN values. Particularly, the cases: # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` if binop is operator.pow: @@ -639,7 +638,6 @@ def test_different_shapes_and_columns(binop): @pytest.mark.parametrize("binop", _binops) def test_different_shapes_and_same_columns(binop): - # TODO: support `pow()` on NaN values. 
Particularly, the cases: # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` if binop is operator.pow: @@ -658,7 +656,6 @@ def test_different_shapes_and_same_columns(binop): @pytest.mark.parametrize("binop", _binops) def test_different_shapes_and_columns_with_unaligned_indices(binop): - # TODO: support `pow()` on NaN values. Particularly, the cases: # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` if binop is operator.pow: @@ -791,7 +788,6 @@ def test_operator_func_series_and_scalar( def test_operator_func_between_series_logical( dtype, func, scalar_a, scalar_b, fill_value ): - gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype) gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype) @@ -1787,7 +1783,6 @@ def test_datetime_dateoffset_binaryop( ) @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): - gsr = cudf.Series(date_col, dtype="datetime64[ns]") psr = gsr.to_pandas() @@ -2294,7 +2289,6 @@ def test_binops_with_NA_consistent(dtype, op): ], ) def test_binops_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype): - if isinstance(lhs, (int, float)): a = cudf.Scalar(lhs, l_dtype) else: @@ -2358,7 +2352,6 @@ def test_binops_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype): def test_binops_reflect_decimal( op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype ): - a = utils._decimal_series(lhs, l_dtype) b = utils._decimal_series(rhs, r_dtype) expect = utils._decimal_series(expect, expect_dtype) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 52b7236b965..52c50ec58a8 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -364,7 +364,6 @@ def test_categorical_set_categories_preserves_order(): @pytest.mark.parametrize("inplace", [True, False]) def test_categorical_as_ordered(pd_str_cat, inplace): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) @@ -388,7 +387,6 @@ def test_categorical_as_ordered(pd_str_cat, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_categorical_as_unordered(pd_str_cat, inplace): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) @@ -428,7 +426,6 @@ def test_categorical_as_unordered(pd_str_cat, inplace): def test_categorical_reorder_categories( pd_str_cat, from_ordered, to_ordered, inplace ): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) @@ -469,7 +466,6 @@ def test_categorical_reorder_categories( ], ) def test_categorical_add_categories(pd_str_cat, inplace): - pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) @@ -510,7 +506,6 @@ def test_categorical_add_categories(pd_str_cat, inplace): ], ) def test_categorical_remove_categories(pd_str_cat, inplace): - pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 3d21994a8d5..c3623f495c0 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -333,7 +333,6 @@ def test_column_view_valid_string_to_numeric(data, to_dtype): def test_column_view_nulls_widths_even(): - data = [1, 2, None, 4, None] expect_data = [ np.int32(val).view("float32") if val is not None else np.nan @@ -361,7 +360,6 @@ def 
test_column_view_nulls_widths_even(): @pytest.mark.parametrize("slc", [slice(1, 5), slice(0, 4), slice(2, 4)]) def test_column_view_numeric_slice(slc): - data = np.array([1, 2, 3, 4, 5], dtype="int32") sr = cudf.Series(data) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index df743a96759..466455eb48c 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from decimal import Decimal @@ -1057,7 +1057,6 @@ def test_concat_join_no_overlapping_columns_many_and_empty2( def test_concat_join_no_overlapping_columns_empty_df_basic( ignore_index, sort, join, axis ): - pdf6 = pd.DataFrame( { "x": range(10), diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py index 02e48f2639b..24c1eaa8f02 100644 --- a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/test_cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. """ Test related to Cut @@ -60,7 +60,6 @@ def test_cut_basic(x, bins, right, include_lowest, ordered, precision): def test_cut_labels( x, bins, right, include_lowest, ordered, precision, labels ): - pcat = pd.cut( x=x, bins=bins, @@ -98,7 +97,6 @@ def test_cut_labels( def test_cut_labels_non_unique( x, bins, right, include_lowest, ordered, precision, labels ): - pcat = pd.cut( x=x, bins=bins, @@ -138,7 +136,6 @@ def test_cut_labels_non_unique( @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("precision", [3]) def test_cut_right(x, bins, right, precision): - pcat = pd.cut( x=x, bins=bins, @@ -177,7 +174,6 @@ def test_cut_right(x, bins, right, precision): def test_cut_drop_duplicates( x, bins, right, precision, duplicates, ordered, include_lowest ): - pcat = pd.cut( x=x, bins=bins, @@ -264,7 +260,6 @@ def test_cut_drop_duplicates_raises( @pytest.mark.parametrize("precision", [1, 2, 3]) @pytest.mark.parametrize("duplicates", ["drop", "raise"]) def test_cut_intervalindex_bin(x, bins, right, precision, duplicates): - pcat = pd.cut( x=x, bins=bins, @@ -294,7 +289,6 @@ def test_cut_intervalindex_bin(x, bins, right, precision, duplicates): @pytest.mark.parametrize("ordered", [True]) @pytest.mark.parametrize("precision", [3]) def test_cut_series(x, bins, right, include_lowest, ordered, precision): - pcat = pd.cut( x=x, bins=bins, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 37c115a47d9..0664e7991b5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4004,7 +4004,6 @@ def test_diff(dtype, period, data_empty): @pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_isnull_isna(df, nan_as_null): - if nan_as_null is False and ( df.select_dtypes(object).isna().any().any() and not df.select_dtypes(object).isna().all().all() diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index deddedbe3e8..ab1fb2eedd5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -702,7 +702,6 @@ def test_to_datetime_errors(data): def test_to_datetime_not_implemented(): - with pytest.raises(NotImplementedError): cudf.to_datetime([], exact=False) @@ -815,7 +814,6 @@ def test_to_datetime_different_formats_notimplemented(): def 
test_datetime_can_cast_safely(): - sr = cudf.Series( ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" ) @@ -933,7 +931,6 @@ def test_str_to_datetime_error(): @pytest.mark.parametrize("data_dtype", DATETIME_TYPES) @pytest.mark.parametrize("other_dtype", DATETIME_TYPES) def test_datetime_subtract(data, other, data_dtype, other_dtype): - gsr = cudf.Series(data, dtype=data_dtype) psr = gsr.to_pandas() @@ -1985,7 +1982,6 @@ def test_error_values(): "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) def test_ceil(data, time_type, resolution): - gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() @@ -2016,7 +2012,6 @@ def test_ceil(data, time_type, resolution): "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) def test_floor(data, time_type, resolution): - gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() @@ -2047,7 +2042,6 @@ def test_floor(data, time_type, resolution): "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) def test_round(data, time_type, resolution): - gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index d53d24cd6c6..ac104b7e513 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -24,7 +24,6 @@ @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = _create_pandas_series_float64_default(data) if len(data) > 0: diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index ddbfdf5eee2..ad513ea3cd5 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import itertools import random @@ -386,7 +386,6 @@ def test_dataframe_drop_duplicates_method(): def test_datetime_drop_duplicates(): - date_df = cudf.DataFrame() date_df["date"] = pd.date_range("11/20/2018", periods=6, freq="D") date_df["value"] = np.random.sample(len(date_df)) diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 857cc114ffa..4272c70f898 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import operator import cupy as cp @@ -198,7 +198,6 @@ def func_na_is_x(x): @pytest.mark.parametrize("fn", (func_x_is_na, func_na_is_x)) def test_is_na(fn): - valid = Masked(1, True) invalid = Masked(1, False) @@ -288,7 +287,6 @@ def func_na_le(x): @pytest.mark.parametrize("fn", na_comparison_funcs) @pytest.mark.parametrize("ty", number_types, ids=number_ids) def test_na_masked_comparisons(fn, ty): - device_fn = cuda.jit(device=True)(fn) @cuda.jit @@ -317,7 +315,6 @@ def test_kernel(err): @pytest.mark.parametrize("fn", na_comparison_funcs) @pytest.mark.parametrize("ty", number_types, ids=number_ids) def test_na_scalar_comparisons(fn, ty): - device_fn = cuda.jit(device=True)(fn) @cuda.jit diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index bf409b30090..f8782681f62 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -124,7 +124,6 @@ def test_cudf_factorize_array(): @pytest.mark.parametrize("pandas_compatibility", [True, False]) def test_factorize_code_pandas_compatibility(pandas_compatibility): - psr = pd.Series([1, 2, 3, 4, 5]) gsr = cudf.from_pandas(psr) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b46949faa06..b757f8acb6e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -625,7 +625,6 @@ def func(group): ], ) def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): - dataset = groupby_jit_datasets[dataset] dataset["val1"] = dataset["val1"].astype(dtype) diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index 71c94858cfe..063fffd948b 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import os from string import ascii_letters @@ -96,7 +96,6 @@ def test_hdf_reader(hdf_files, columns): ) for column in hdf_series.keys(): - expect_series = pd.read_hdf(hdf_series[column]) got_series = cudf.read_hdf(hdf_series[column]) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index a27de60c2c5..ef853a23004 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np @@ -16,7 +16,6 @@ @pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) def test_create_interval_series(data1, data2, data3, data4, closed): - expect = pd.Series(pd.Interval(data1, data2, closed), dtype="interval") got = cudf.Series(pd.Interval(data1, data2, closed), dtype="interval") assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 9c9c99a0cfa..ece676329bc 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from itertools import combinations, product, repeat @@ -502,7 +502,6 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): def test_safe_merging_with_left_empty(): - np.random.seed(0) pairs = ("bcd", "b") @@ -910,7 +909,6 @@ def test_join_multi(how, column_a, column_b, column_c): ], ) def test_merge_multi(kwargs): - left = cudf.DataFrame( { "a": [1, 2, 3, 4, 3, 5, 6], @@ -1072,7 +1070,6 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): def test_typecast_on_join_no_float_round(): - other_data = ["a", "b", "c", "d", "e"] join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") @@ -1530,7 +1527,6 @@ def test_categorical_typecast_outer(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["object"]) def test_categorical_typecast_inner_one_cat(dtype): - data = np.array([1, 2, 3], dtype=dtype) left = make_categorical_dataframe(data) @@ -1542,7 +1538,6 @@ def test_categorical_typecast_inner_one_cat(dtype): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["object"]) def test_categorical_typecast_left_one_cat(dtype): - data = np.array([1, 2, 3], dtype=dtype) left = make_categorical_dataframe(data) @@ -1554,7 +1549,6 @@ def test_categorical_typecast_left_one_cat(dtype): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["object"]) def test_categorical_typecast_outer_one_cat(dtype): - data = np.array([1, 2, 3], dtype=dtype) left = make_categorical_dataframe(data) @@ -1810,7 +1804,6 @@ def test_typecast_on_join_indexes_matching_categorical(): ], ) def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): - if how in ("leftsemi", "leftanti") and ( kwargs.get("left_index") or kwargs.get("right_index") ): diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 13e44e7cf59..e52bbe54072 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import re from decimal import Decimal @@ -863,7 +863,6 @@ def test_dataframe_with_nulls_where_with_scalars(fill_value): def test_dataframe_with_different_types(): - # Testing for int and float pdf = pd.DataFrame( {"A": [111, 22, 31, 410, 56], "B": [-10.12, 121.2, 45.7, 98.4, 87.6]} @@ -963,7 +962,6 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): # to_replace is a list, replacement is a scalar if not can_replace: with pytest.raises(TypeError): - sr.replace([2, 3], replacement) else: expect = psr.replace([2, 3], replacement).astype(psr.dtype) @@ -1168,7 +1166,6 @@ def test_series_clip(data, lower, upper, inplace): def test_series_exceptions_for_clip(): - with pytest.raises(ValueError): cudf.Series([1, 2, 3, 4]).clip([1, 2], [2, 3]) @@ -1331,7 +1328,6 @@ def test_series_replace_errors(): ], ) def test_replace_nulls(gsr, old, new, expected): - actual = gsr.replace(old, new) assert_eq( expected.sort_values().reset_index(drop=True), diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index a36cc1b3819..efc738eec1f 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import textwrap @@ -382,7 +382,6 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ], ) def test_generic_index_null(index, expected_repr): - actual_repr = repr(index) assert expected_repr == actual_repr diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 0a07eecd096..b437c82bf6e 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import re from itertools import chain @@ -253,7 +253,6 @@ def test_df_stack_multiindex_column_axis_pd_example(level): ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_interleave_columns(nulls, num_cols, num_rows, dtype): - if dtype not in ["float32", "float64"] and nulls in ["some"]: pytest.skip(reason="nulls not supported in dtype: " + dtype) @@ -290,7 +289,6 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): @pytest.mark.parametrize("dtype", ALL_TYPES) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_tile(nulls, num_cols, num_rows, dtype, count): - if dtype not in ["float32", "float64"] and nulls in ["some"]: pytest.skip(reason="nulls not supported in dtype: " + dtype) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 19714b7b9d3..91643f21155 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import math from contextlib import contextmanager @@ -154,7 +154,6 @@ def test_rolling_with_offset(agg): @pytest.mark.parametrize("seed", [100, 2000]) @pytest.mark.parametrize("window_size", [2, 10, 100]) def test_rolling_var_std_large(agg, ddof, center, seed, window_size): - iupper_bound = math.sqrt(np.iinfo(np.int64).max / window_size) ilower_bound = -math.sqrt(abs(np.iinfo(np.int64).min) / window_size) @@ -315,7 +314,6 @@ def test_rolling_getitem_window(): ) @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.from_pandas(psr) @@ -352,7 +350,6 @@ def some_func(A): ) @pytest.mark.parametrize("center", [True, False]) def test_rolling_dataframe_numba_udf_basic(data, center): - pdf = pd.DataFrame(data) gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index d73a1d40aaa..05a91a8fea3 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import datetime import re @@ -178,7 +178,6 @@ def test_scalar_device_initialization_decimal(value, decimal_type): @pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_scalar_roundtrip(value): - s = cudf.Scalar(value) assert s._is_host_value_current @@ -352,7 +351,6 @@ def test_scalar_implicit_int_conversion(value): @pytest.mark.parametrize("cls", [int, float, bool]) @pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"})) def test_scalar_invalid_implicit_conversion(cls, dtype): - try: cls( pd.NaT diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index 17cf3cf8141..3ba652ff6c0 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import cupy import numpy as np import pandas as pd @@ -86,7 +86,6 @@ def test_search_sorted_dataframe_unequal_number_of_columns(): @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_categorical(side): - cat1 = pd.Categorical( ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True ) @@ -106,7 +105,6 @@ def test_searchsorted_categorical(side): @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_datetime(side): - psr1 = pd.Series( pd.date_range("20190101", "20200101", freq="400h", name="times") ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 2e2b79386d7..7dcbf859f08 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -813,7 +813,6 @@ def test_round_nan_as_null_false(series, decimal): @pytest.mark.parametrize("ps", _series_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_series_isnull_isna(ps, nan_as_null): - if nan_as_null is False and ( ps.isna().any() and not ps.isna().all() and ps.dtype == object ): @@ -829,7 +828,6 @@ def test_series_isnull_isna(ps, nan_as_null): @pytest.mark.parametrize("ps", _series_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_series_notnull_notna(ps, nan_as_null): - if nan_as_null is False and ( ps.isna().any() and not ps.isna().all() and ps.dtype == object ): diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index b3db1310adb..8152c1bc03c 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import string from itertools import product @@ -205,7 +205,6 @@ def test_dataframe_nsmallest_sliced(counts, sliceobj): def test_dataframe_multi_column( num_cols, num_rows, dtype, ascending, na_position ): - np.random.seed(0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() @@ -234,7 +233,6 @@ def test_dataframe_multi_column( def test_dataframe_multi_column_nulls( num_cols, num_rows, dtype, nulls, ascending, na_position ): - np.random.seed(0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() @@ -298,7 +296,6 @@ def test_series_nlargest_nelem(nelem): @pytest.mark.parametrize("nelem", [1, 10, 100]) @pytest.mark.parametrize("keep", [True, False]) def test_dataframe_scatter_by_map(map_size, nelem, keep): - strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] np.random.seed(0) df = DataFrame() diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 5f010668383..8ff4dc73c4c 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from concurrent.futures import ThreadPoolExecutor @@ -182,7 +182,6 @@ def test_exact_quantiles_int(int_method): def test_approx_quantiles(): - arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] @@ -222,7 +221,6 @@ def test_approx_quantiles_int(): ], ) def test_misc_quantiles(data, q): - pdf_series = _create_pandas_series_float64_default(data) gdf_series = _create_cudf_series_float64_default(data) @@ -485,7 +483,6 @@ def test_corr1d(data1, data2, method): @pytest.mark.parametrize("method", ["spearman", "pearson"]) def test_df_corr(method): - gdf = randomdata(100, {str(x): float for x in range(50)}) pdf = gdf.to_pandas() got = gdf.corr(method) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 198dfa9372c..4c5598b547e 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import json import re @@ -847,7 +847,6 @@ def test_string_contains_case(ps_gs): ], ) def test_string_like(pat, esc, expect): - expectation = does_not_raise() if len(esc) > 1: expectation = pytest.raises(ValueError) @@ -2402,7 +2401,6 @@ def test_string_str_translate(data): def test_string_str_filter_characters(): - data = [ "hello world", "A+B+C+D", @@ -2432,7 +2430,6 @@ def test_string_str_filter_characters(): def test_string_str_code_points(): - data = [ "abc", "Def", @@ -2598,7 +2595,6 @@ def test_string_typecast_error(data, obj_type, dtype): ], ) def test_string_hex_to_int(data): - gsr = cudf.Series(data) expected = cudf.Series([263988422296292, 0, 281474976710655]) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index 3024c8e2e7b..091cd6b57a4 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -112,7 +112,6 @@ def test_basic_assert_series_equal( check_categorical, dtype, ): - p_left = pd.Series([1, 2, 3], name="a", dtype=dtype) p_right = pd.Series(rdata, name=rname, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index d86612d3143..12f1ace7867 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1185,7 +1185,6 @@ def test_timedelta_fillna(data, dtype, fill_value): ], ) def test_timedelta_str_roundtrip(gsr, expected_series): - actual_series = gsr.astype("str") assert_eq(expected_series, actual_series) diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 723bbdf9371..88938457545 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import numpy as np @@ -23,7 +23,6 @@ def _generic_function(a): ], ) def test_apply_python_lambda(dtype, udf, testfunc): - size = 500 lhs_arr = np.random.random(size).astype(dtype) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 11970944a95..95ea4544917 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -74,7 +74,6 @@ def run_masked_udf_test(func, data, args=(), nullable=True, **kwargs): def run_masked_string_udf_test(func, data, args=(), **kwargs): - gdf = data pdf = data.to_pandas(nullable=True) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index 66dbd731e69..d57303ca122 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import functools from typing import Any, Dict @@ -108,7 +108,6 @@ def apply_chunks( @acquire_spill_lock() def make_aggregate_nullmask(df, columns=None, op="__and__"): - out_mask = None for k in columns or df._data: col = cudf.core.dataframe.extract_col(df, k) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index a0915951240..ef078ed8c5d 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
# This function is from the rapidsai/clx repo at below link # https://github.com/rapidsai/clx/blob/267c6d30805c9dcbf80840f222bf31c5c4b7068a/python/clx/analytics/_perfect_hash.py import numpy as np @@ -158,7 +158,6 @@ def _perfect_hash(integers, max_constant): def _pack_keys_and_values(flattened_hash_table, original_dict): - for i in range(len(flattened_hash_table)): if flattened_hash_table[i] in original_dict: value = original_dict[flattened_hash_table[i]] @@ -189,7 +188,6 @@ def _store_func( first_token_id, sep_token_id, ): - with open(out_name, mode="w+") as f: f.write(f"{outer_a}\n") f.write(f"{outer_b}\n") @@ -215,7 +213,6 @@ def _retrieve( inner_table_coeffs, offsets_into_ht, ): - bin_hash = _hash_func(k, outer_a, outer_b, num_outer_bins) start_offset_in_ht = offsets_into_ht[bin_hash] inner_table_values = inner_table_coeffs[bin_hash] diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6641bd8290a..57e657eb5c1 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime import os @@ -2028,7 +2028,7 @@ def _merge_ranges(byte_ranges, max_block=256_000_000, max_gap=64_000): return new_ranges offset, size = byte_ranges[0] - for (new_offset, new_size) in byte_ranges[1:]: + for new_offset, new_size in byte_ranges[1:]: gap = new_offset - (offset + size) if gap > max_gap or (size + new_size + gap) > max_block: # Gap is too large or total read is too large @@ -2068,7 +2068,7 @@ def _read_byte_ranges( # Simple utility to copy remote byte ranges # into a local buffer for IO in libcudf workers = [] - for (offset, nbytes) in ranges: + for offset, nbytes in ranges: if len(ranges) > 1: workers.append( Thread( diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 738ff24f374..df4bed0be0a 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1135,7 +1135,6 @@ def test_index_new(): @pytest.mark.xfail(not LOADED, reason="Should not fail in accelerated mode") def test_groupby_apply_callable_referencing_pandas(dataframe): - pdf, df = dataframe class Callable1: diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 5840ff710d5..1cda9b71387 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import socket import pytest @@ -7,7 +7,6 @@ @pytest.fixture(scope="session") def kafka_client(): - # Check for the existence of a kafka broker s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 387643587d1..5d951cec266 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import warnings from collections.abc import Iterator @@ -438,7 +438,6 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) @_dask_cudf_nvtx_annotate def hash_object_cudf_index(ind, index=None): - if isinstance(ind, cudf.MultiIndex): return ind.to_frame(index=False).hash_values() @@ -586,7 +585,6 @@ def from_dict( columns=None, constructor=cudf.DataFrame, ): - return _default_backend( dd.from_dict, data, diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 08c03235484..c2b2428bf14 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -153,7 +153,6 @@ def set_index( shuffle_method=None, **kwargs, ): - pre_sorted = sorted del sorted @@ -165,7 +164,6 @@ def set_index( and cudf.api.types.is_string_dtype(self[other].dtype) ) ): - # Let upstream-dask handle "pre-sorted" case if pre_sorted: return dd.shuffle.set_sorted_index( diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index d82d539358d..fc962670c47 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import itertools import warnings from contextlib import ExitStack @@ -73,7 +73,6 @@ def _read_paths( dataset_kwargs=None, **kwargs, ): - # Simplify row_groups if all None if row_groups == [None for path in paths]: row_groups = None @@ -94,7 +93,6 @@ def _read_paths( dataset_kwargs = dataset_kwargs or {} dataset_kwargs["partitioning"] = partitioning or "hive" with ExitStack() as stack: - # Non-local filesystem handling paths_or_fobs = paths if not _is_local_filesystem(fs): @@ -153,7 +151,6 @@ def _read_paths( df = df[projected_columns] if partitions and partition_keys is None: - # Use `HivePartitioning` by default ds = pa_ds.dataset( paths, @@ -175,7 +172,6 @@ def _read_paths( raise ValueError("Must pass partition sets") for i, (name, index2) in enumerate(partition_keys): - if len(partitions[i].keys): # Build a categorical column from `codes` directly # (since the category is often a larger dtype) @@ -211,7 +207,6 @@ def read_partition( open_file_options=None, **kwargs, ): - if columns is not None: columns = [c for c in columns] if isinstance(index, list): @@ -241,7 +236,6 @@ def read_partition( # inform the user that the `read_parquet` partition # size is too large for the available memory try: - # Assume multi-piece read paths = [] rgs = [] @@ -249,7 +243,6 @@ def read_partition( dfs = [] for i, piece in enumerate(pieces): - (path, row_group, partition_keys) = piece row_group = None if row_group == [None] else row_group diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 4ff630a89e8..5f1aa98e888 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import gzip import os @@ -248,7 +248,6 @@ def test_read_csv_nrows(csv_end_bad_lines): def test_read_csv_nrows_error(csv_end_bad_lines): - with pytest.raises(ValueError): dask_cudf.read_csv( csv_end_bad_lines, nrows=2, blocksize="100 MiB" diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index 5565a44c7d8..c2be75e8ddd 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import glob import os @@ -85,7 +85,6 @@ def test_read_orc_filtered(tmpdir, engine, predicate, expected_len): def test_read_orc_first_file_empty(tmpdir): - # Write a 3-file dataset where the first file is empty # See: https://github.com/rapidsai/cudf/issues/8011 path = str(tmpdir) @@ -112,7 +111,6 @@ def test_read_orc_first_file_empty(tmpdir): ], ) def test_to_orc(tmpdir, dtypes, compression, compute): - # Create cudf and dask_cudf dataframes df = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) df = df.set_index("index").sort_index() diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 2adace565d5..9cdb7c5220b 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import os from glob import glob @@ -11,7 +11,6 @@ def read_text(path, chunksize="256 MiB", **kwargs): - if isinstance(chunksize, str): chunksize = parse_bytes(chunksize) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 7ed5d797822..3a54672c1d3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -254,7 +254,6 @@ def test_string_slicing(data): def test_categorical_categories(): - df = DataFrame( {"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)} ) @@ -283,7 +282,6 @@ def test_categorical_as_known(): def test_str_slice(): - df = DataFrame({"a": ["abc,def,123", "xyz,hi,bye"]}) ddf = dgd.from_cudf(df, 1) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 63fd6599496..5b11b337f21 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -106,7 +106,6 @@ def test_from_cudf(): def test_from_cudf_multiindex_raises(): - df = cudf.DataFrame({"x": list("abc"), "y": [1, 2, 3], "z": [1, 2, 3]}) with pytest.raises(NotImplementedError): @@ -115,7 +114,6 @@ def test_from_cudf_multiindex_raises(): def test_from_cudf_with_generic_idx(): - cdf = cudf.DataFrame( { "a": list(range(20)), @@ -641,7 +639,6 @@ def test_concat(gdf, gddf, series): def test_boolean_index(gdf, gddf): - gdf2 = gdf[gdf.x > 2] gddf2 = gddf[gddf.x > 2] @@ -658,7 +655,6 @@ def test_drop(gdf, gddf): @pytest.mark.parametrize("deep", [True, False]) @pytest.mark.parametrize("index", [True, False]) def test_memory_usage(gdf, gddf, index, deep): - dd.assert_eq( gdf.memory_usage(deep=deep, index=index), gddf.memory_usage(deep=deep, index=index), @@ -710,7 +706,6 @@ def test_hash_object_dispatch(index): ], ) def test_make_meta_backends(index): - dtypes = ["int8", "int32", "int64", "float64"] df = cudf.DataFrame( {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes} @@ -734,7 +729,6 @@ def test_make_meta_backends(index): # Check dask code path if not MultiIndex if not isinstance(df.index, cudf.MultiIndex): - ddf = dgd.from_cudf(df, npartitions=1) # Check "empty" metadata types diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index c34fbc3b0e7..e966e58f46e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -66,7 +66,6 @@ def test_series_reduce(reducer): "op", ["max", "min", "sum", "prod", "mean", "var", "std"] ) def test_rowwise_reductions(data, op): - gddf = dgd.from_cudf(data, npartitions=10) pddf = gddf.to_dask_dataframe() From f14ba2221efab298f064b4275f69b58a833b6374 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Jan 2024 07:37:10 -0800 Subject: [PATCH 53/60] Move all core types to using enum class in Cython (#14876) This change is a necessary prerequisite for adding other APIs to pylibcudf that need these types. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14876 --- python/cudf/cudf/_lib/cpp/types.pxd | 83 ++++++++++----------- python/cudf/cudf/_lib/lists.pyx | 4 +- python/cudf/cudf/_lib/stream_compaction.pyx | 4 +- 3 files changed, 42 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 14bf8a83de0..13aebdff726 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -1,58 +1,56 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, uint32_t +from libcpp cimport bool cdef extern from "cudf/types.hpp" namespace "cudf" nogil: - # The declaration below is to work around - # https://github.com/cython/cython/issues/5637 - """ - #define __PYX_ENUM_CLASS_DECL enum - """ ctypedef int32_t size_type ctypedef uint32_t bitmask_type ctypedef uint32_t char_utf8 - ctypedef enum mask_state: - UNALLOCATED "cudf::mask_state::UNALLOCATED" - UNINITIALIZED "cudf::mask_state::UNINITIALIZED" - ALL_VALID "cudf::mask_state::ALL_VALID" - ALL_NULL "cudf::mask_state::ALL_NULL" + # A Hack to let cython compile with __int128_t symbol + # https://stackoverflow.com/a/27609033 + ctypedef int int128 "__int128_t" - ctypedef enum order "cudf::order": - ASCENDING "cudf::order::ASCENDING" - DESCENDING "cudf::order::DESCENDING" + cpdef enum class mask_state(int32_t): + UNALLOCATED + UNINITIALIZED + ALL_VALID + ALL_NULL - ctypedef enum null_order "cudf::null_order": - AFTER "cudf::null_order::AFTER" - BEFORE "cudf::null_order::BEFORE" + cpdef enum class order(bool): + ASCENDING + DESCENDING - ctypedef enum sorted "cudf::sorted": - NO "cudf::sorted::NO" - YES "cudf::sorted::YES" + cpdef enum class null_order(bool): + AFTER + BEFORE + + cpdef enum class sorted(bool): + NO + YES cdef cppclass order_info: sorted is_sorted order ordering null_order null_ordering - ctypedef enum null_policy "cudf::null_policy": - EXCLUDE "cudf::null_policy::EXCLUDE" - INCLUDE "cudf::null_policy::INCLUDE" + cpdef enum class null_policy(bool): + EXCLUDE + INCLUDE - ctypedef enum nan_policy "cudf::nan_policy": - NAN_IS_NULL "cudf::nan_policy::NAN_IS_NULL" - NAN_IS_VALID "cudf::nan_policy::NAN_IS_VALID" + cpdef enum class nan_policy(bool): + NAN_IS_NULL + NAN_IS_VALID - ctypedef enum null_equality "cudf::null_equality": - EQUAL "cudf::null_equality::EQUAL" - UNEQUAL "cudf::null_equality::UNEQUAL" + cpdef enum class null_equality(bool): + EQUAL + UNEQUAL - ctypedef enum nan_equality "cudf::nan_equality": - # These names differ from the C++ names due to Cython warnings if - # "UNEQUAL" is declared by both null_equality and nan_equality. - ALL_EQUAL "cudf::nan_equality::ALL_EQUAL" - NANS_UNEQUAL "cudf::nan_equality::UNEQUAL" + cpdef enum class nan_equality(bool): + ALL_EQUAL + UNEQUAL cpdef enum class type_id(int32_t): EMPTY @@ -93,14 +91,9 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: type_id id() except + int32_t scale() except + -cdef extern from "cudf/types.hpp" namespace "cudf" nogil: - ctypedef enum interpolation: - LINEAR "cudf::interpolation::LINEAR" - LOWER "cudf::interpolation::LOWER" - HIGHER "cudf::interpolation::HIGHER" - MIDPOINT "cudf::interpolation::MIDPOINT" - NEAREST "cudf::interpolation::NEAREST" - - # A Hack to let cython compile with __int128_t symbol - # https://stackoverflow.com/a/27609033 - ctypedef int int128 "__int128_t" + cpdef enum class interpolation(int32_t): + LINEAR + LOWER + HIGHER + MIDPOINT + NEAREST diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 199641fd2ce..f76d7a9a388 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock @@ -84,7 +84,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL ) cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.NANS_UNEQUAL + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL ) cdef unique_ptr[column] c_result diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 9b22728d2f0..d7725e8df94 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -209,7 +209,7 @@ def distinct_indices( cdef nan_equality cpp_nans_equal = ( nan_equality.ALL_EQUAL if nans_equal - else nan_equality.NANS_UNEQUAL + else nan_equality.UNEQUAL ) cdef table_view source = table_view_from_columns(columns) cdef unique_ptr[column] c_result From 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 Mon Sep 17 00:00:00 2001 From: AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Date: Thu, 25 Jan 2024 11:40:04 -0500 Subject: [PATCH 54/60] Fix index difference to follow the pandas format (#14789) This PR fixes an error in `Index.difference` where the function keeps duplicate elements while pandas removes the duplicates. The tests had no inputs with duplicates, so I added new tests too (I added the test from the original issue). - closes #14489 Authors: - AmirAli Mirian (https://github.com/amiralimi) - Ashwin Srinath (https://github.com/shwina) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14789 --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/cudf/cudf/tests/test_index.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2aef77b6c99..d7d8e26db1b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1040,11 +1040,11 @@ def difference(self, other, sort=None): res_name = _get_result_name(self.name, other.name) if is_mixed_with_object_dtype(self, other): - difference = self.copy() + difference = self.copy().unique() else: other = other.copy(deep=False) difference = cudf.core.index._index_from_data( - cudf.DataFrame._from_data({"None": self._column}) + cudf.DataFrame._from_data({"None": self._column.unique()}) .merge( cudf.DataFrame._from_data({"None": other._column}), how="leftanti", diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index a480a4624f7..e0a369d8d91 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
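# Illustrative sketch of the behaviour the Index.difference change above targets.
# The inputs mirror the new test parameters added below; the expected output is
# what pandas returns for the same call.
import pandas as pd
import cudf

pidx = pd.Index([1, 1, 2, 2])
gidx = cudf.Index([1, 1, 2, 2])

print(pidx.difference(pd.Index([2])))    # Index([1], dtype='int64') -- duplicates are dropped
print(gidx.difference(cudf.Index([2])))  # with this fix, cudf should match the pandas result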
""" Test related to Index @@ -803,6 +803,7 @@ def test_index_to_series(data): pd.Series(["1", "2", "a", "3", None], dtype="category"), range(0, 10), [], + [1, 1, 2, 2], ], ) @pytest.mark.parametrize( @@ -819,6 +820,7 @@ def test_index_to_series(data): range(2, 4), pd.Series(["1", "a", "3", None], dtype="category"), [], + [2], ], ) @pytest.mark.parametrize("sort", [None, False]) From 35011dd13c93f2b4e7c46e9360a7c545eb40dd9b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 25 Jan 2024 17:15:46 +0000 Subject: [PATCH 55/60] De-DOS line-endings (#14880) These are the only two files in the repo (other than the sphinx make.bat files, which should have DOS line-endings) that use \r\n as the line-ending. Let's fix that. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14880 --- cpp/doxygen/unicode.md | 46 +-- cpp/src/search/contains_table.cu | 584 +++++++++++++++---------------- 2 files changed, 315 insertions(+), 315 deletions(-) diff --git a/cpp/doxygen/unicode.md b/cpp/doxygen/unicode.md index 1ab09e110c1..089bb944b42 100644 --- a/cpp/doxygen/unicode.md +++ b/cpp/doxygen/unicode.md @@ -1,23 +1,23 @@ -# Unicode Limitations - -The strings column currently supports only UTF-8 characters internally. -For functions that require character testing (e.g. cudf::strings::all_characters_of_type()) or -case conversion (e.g. cudf::strings::capitalize(), etc) only the 16-bit [Unicode 13.0](http://www.unicode.org/versions/Unicode13.0.0) -character code-points (0-65535) values are supported. -Case conversion and character testing on characters above code-point 65535 are not supported. - -Case conversions that are context-sensitive are not supported. Also, case conversions that result -in multiple characters are not reversible. That is, adjacent individual characters will not be case converted -to a single character. For example, converting character ß to upper case will result in the characters "SS". But converting "SS" to lower case will produce "ss". - -Strings case and type APIs: - -- cudf::strings::all_characters_of_type() -- cudf::strings::to_upper() -- cudf::strings::to_lower() -- cudf::strings::capitalize() -- cudf::strings::title() -- cudf::strings::swapcase() - -Also, using regex patterns that use the shorthand character classes `\d \D \w \W \s \S` will include only appropriate characters with -code-points between (0-65535). +# Unicode Limitations + +The strings column currently supports only UTF-8 characters internally. +For functions that require character testing (e.g. cudf::strings::all_characters_of_type()) or +case conversion (e.g. cudf::strings::capitalize(), etc) only the 16-bit [Unicode 13.0](http://www.unicode.org/versions/Unicode13.0.0) +character code-points (0-65535) values are supported. +Case conversion and character testing on characters above code-point 65535 are not supported. + +Case conversions that are context-sensitive are not supported. Also, case conversions that result +in multiple characters are not reversible. That is, adjacent individual characters will not be case converted +to a single character. For example, converting character ß to upper case will result in the characters "SS". But converting "SS" to lower case will produce "ss". 
+ +Strings case and type APIs: + +- cudf::strings::all_characters_of_type() +- cudf::strings::to_upper() +- cudf::strings::to_lower() +- cudf::strings::capitalize() +- cudf::strings::title() +- cudf::strings::swapcase() + +Also, using regex patterns that use the shorthand character classes `\d \D \w \W \s \S` will include only appropriate characters with +code-points between (0-65535). diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 09122b37d6f..b8ece03c4a0 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -1,292 +1,292 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include -#include - -#include - -#include - -#include - -#include - -namespace cudf::detail { - -namespace { - -using cudf::experimental::row::lhs_index_type; -using cudf::experimental::row::rhs_index_type; - -/** - * @brief An hasher adapter wrapping both haystack hasher and needles hasher - */ -template -struct hasher_adapter { - hasher_adapter(HaystackHasher const& haystack_hasher, NeedleHasher const& needle_hasher) - : _haystack_hasher{haystack_hasher}, _needle_hasher{needle_hasher} - { - } - - __device__ constexpr auto operator()(lhs_index_type idx) const noexcept - { - return _haystack_hasher(static_cast(idx)); - } - - __device__ constexpr auto operator()(rhs_index_type idx) const noexcept - { - return _needle_hasher(static_cast(idx)); - } - - private: - HaystackHasher const _haystack_hasher; - NeedleHasher const _needle_hasher; -}; - -/** - * @brief An comparator adapter wrapping both self comparator and two table comparator - */ -template -struct comparator_adapter { - comparator_adapter(SelfEqual const& self_equal, TwoTableEqual const& two_table_equal) - : _self_equal{self_equal}, _two_table_equal{two_table_equal} - { - } - - __device__ constexpr auto operator()(lhs_index_type lhs_index, - lhs_index_type rhs_index) const noexcept - { - auto const lhs = static_cast(lhs_index); - auto const rhs = static_cast(rhs_index); - - return _self_equal(lhs, rhs); - } - - __device__ constexpr auto operator()(lhs_index_type lhs_index, - rhs_index_type rhs_index) const noexcept - { - return _two_table_equal(lhs_index, rhs_index); - } - - private: - SelfEqual const _self_equal; - TwoTableEqual const _two_table_equal; -}; - -/** - * @brief Build a row bitmask for the input table. - * - * The output bitmask will have invalid bits corresponding to the input rows having nulls (at - * any nested level) and vice versa. 
- * - * @param input The input table - * @param stream CUDA stream used for device memory operations and kernel launches - * @return A pair of pointer to the output bitmask and the buffer containing the bitmask - */ -std::pair build_row_bitmask(table_view const& input, - rmm::cuda_stream_view stream) -{ - auto const nullable_columns = get_nullable_columns(input); - CUDF_EXPECTS(nullable_columns.size() > 0, - "The input table has nulls thus it should have nullable columns."); - - // If there are more than one nullable column, we compute `bitmask_and` of their null masks. - // Otherwise, we have only one nullable column and can use its null mask directly. - if (nullable_columns.size() > 1) { - auto row_bitmask = - cudf::detail::bitmask_and( - table_view{nullable_columns}, stream, rmm::mr::get_current_device_resource()) - .first; - auto const row_bitmask_ptr = static_cast(row_bitmask.data()); - return std::pair(std::move(row_bitmask), row_bitmask_ptr); - } - - return std::pair(rmm::device_buffer{0, stream}, nullable_columns.front().null_mask()); -} - -/** - * @brief Invokes the given `func` with desired comparators based on the specified `compare_nans` - * parameter - * - * @tparam HasNested Flag indicating whether there are nested columns in haystack or needles - * @tparam Hasher Type of device hash function - * @tparam Func Type of the helper function doing `contains` check - * - * @param compare_nulls Control whether nulls should be compared as equal or not - * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not - * @param haystack_has_nulls Flag indicating whether haystack has nulls or not - * @param has_any_nulls Flag indicating whether there are nested nulls is either haystack or needles - * @param self_equal Self table comparator - * @param two_table_equal Two table comparator - * @param d_hasher Device hash functor - * @param func The input functor to invoke - */ -template -void dispatch_nan_comparator( - null_equality compare_nulls, - nan_equality compare_nans, - bool haystack_has_nulls, - bool has_any_nulls, - cudf::experimental::row::equality::self_comparator self_equal, - cudf::experimental::row::equality::two_table_comparator two_table_equal, - Hasher const& d_hasher, - Func&& func) -{ - // Distinguish probing scheme CG sizes between nested and flat types for better performance - auto const probing_scheme = [&]() { - if constexpr (HasNested) { - return cuco::experimental::linear_probing<4, Hasher>{d_hasher}; - } else { - return cuco::experimental::linear_probing<1, Hasher>{d_hasher}; - } - }(); - - if (compare_nans == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - auto const d_self_equal = self_equal.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_equal_comparator{}); - auto const d_two_table_equal = two_table_equal.equal_to( - nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_equal_comparator{}); - func(d_self_equal, d_two_table_equal, probing_scheme); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - auto const d_self_equal = self_equal.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_unequal_comparator{}); - auto const d_two_table_equal = two_table_equal.equal_to( - nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_unequal_comparator{}); - func(d_self_equal, d_two_table_equal, probing_scheme); - } -} - -} // namespace - 
-rmm::device_uvector contains(table_view const& haystack, - table_view const& needles, - null_equality compare_nulls, - nan_equality compare_nans, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(cudf::have_same_types(haystack, needles), "Column types mismatch"); - - auto const haystack_has_nulls = has_nested_nulls(haystack); - auto const needles_has_nulls = has_nested_nulls(needles); - auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; - - auto const preprocessed_needles = - cudf::experimental::row::equality::preprocessed_table::create(needles, stream); - auto const preprocessed_haystack = - cudf::experimental::row::equality::preprocessed_table::create(haystack, stream); - - auto const haystack_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); - auto const d_haystack_hasher = haystack_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); - auto const needle_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); - auto const d_needle_hasher = needle_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); - auto const d_hasher = hasher_adapter{d_haystack_hasher, d_needle_hasher}; - - auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack); - auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( - preprocessed_haystack, preprocessed_needles); - - // The output vector. - auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); - - auto const haystack_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) { - return lhs_index_type{idx}; - })); - auto const needles_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) { - return rhs_index_type{idx}; - })); - - auto const helper_func = - [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) { - auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; - - auto set = cuco::experimental::static_set{ - cuco::experimental::extent{compute_hash_table_size(haystack.num_rows())}, - cuco::empty_key{lhs_index_type{-1}}, - d_equal, - probing_scheme, - detail::hash_table_allocator_type{default_allocator{}, stream}, - stream.value()}; - - if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { - auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); - auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; - - // If the haystack table has nulls but they are compared unequal, don't insert them. 
- // Otherwise, it was known to cause performance issue: - // - https://github.com/rapidsai/cudf/pull/6943 - // - https://github.com/rapidsai/cudf/pull/8277 - set.insert_if_async(haystack_iter, - haystack_iter + haystack.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - stream.value()); - } else { - set.insert_async(haystack_iter, haystack_iter + haystack.num_rows(), stream.value()); - } - - if (needles_has_nulls && compare_nulls == null_equality::UNEQUAL) { - auto const bitmask_buffer_and_ptr = build_row_bitmask(needles, stream); - auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; - set.contains_if_async(needles_iter, - needles_iter + needles.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - contained.begin(), - stream.value()); - } else { - set.contains_async( - needles_iter, needles_iter + needles.num_rows(), contained.begin(), stream.value()); - } - }; - - if (cudf::detail::has_nested_columns(haystack)) { - dispatch_nan_comparator(compare_nulls, - compare_nans, - haystack_has_nulls, - has_any_nulls, - self_equal, - two_table_equal, - d_hasher, - helper_func); - } else { - dispatch_nan_comparator(compare_nulls, - compare_nans, - haystack_has_nulls, - has_any_nulls, - self_equal, - two_table_equal, - d_hasher, - helper_func); - } - - return contained; -} - -} // namespace cudf::detail +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +#include + +namespace cudf::detail { + +namespace { + +using cudf::experimental::row::lhs_index_type; +using cudf::experimental::row::rhs_index_type; + +/** + * @brief An hasher adapter wrapping both haystack hasher and needles hasher + */ +template +struct hasher_adapter { + hasher_adapter(HaystackHasher const& haystack_hasher, NeedleHasher const& needle_hasher) + : _haystack_hasher{haystack_hasher}, _needle_hasher{needle_hasher} + { + } + + __device__ constexpr auto operator()(lhs_index_type idx) const noexcept + { + return _haystack_hasher(static_cast(idx)); + } + + __device__ constexpr auto operator()(rhs_index_type idx) const noexcept + { + return _needle_hasher(static_cast(idx)); + } + + private: + HaystackHasher const _haystack_hasher; + NeedleHasher const _needle_hasher; +}; + +/** + * @brief An comparator adapter wrapping both self comparator and two table comparator + */ +template +struct comparator_adapter { + comparator_adapter(SelfEqual const& self_equal, TwoTableEqual const& two_table_equal) + : _self_equal{self_equal}, _two_table_equal{two_table_equal} + { + } + + __device__ constexpr auto operator()(lhs_index_type lhs_index, + lhs_index_type rhs_index) const noexcept + { + auto const lhs = static_cast(lhs_index); + auto const rhs = static_cast(rhs_index); + + return _self_equal(lhs, rhs); + } + + __device__ constexpr auto operator()(lhs_index_type lhs_index, + rhs_index_type rhs_index) const noexcept + { + return _two_table_equal(lhs_index, rhs_index); + } + + private: + SelfEqual const _self_equal; + TwoTableEqual const _two_table_equal; +}; + +/** + * @brief Build a row bitmask for the input table. + * + * The output bitmask will have invalid bits corresponding to the input rows having nulls (at + * any nested level) and vice versa. + * + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A pair of pointer to the output bitmask and the buffer containing the bitmask + */ +std::pair build_row_bitmask(table_view const& input, + rmm::cuda_stream_view stream) +{ + auto const nullable_columns = get_nullable_columns(input); + CUDF_EXPECTS(nullable_columns.size() > 0, + "The input table has nulls thus it should have nullable columns."); + + // If there are more than one nullable column, we compute `bitmask_and` of their null masks. + // Otherwise, we have only one nullable column and can use its null mask directly. 
+ if (nullable_columns.size() > 1) { + auto row_bitmask = + cudf::detail::bitmask_and( + table_view{nullable_columns}, stream, rmm::mr::get_current_device_resource()) + .first; + auto const row_bitmask_ptr = static_cast(row_bitmask.data()); + return std::pair(std::move(row_bitmask), row_bitmask_ptr); + } + + return std::pair(rmm::device_buffer{0, stream}, nullable_columns.front().null_mask()); +} + +/** + * @brief Invokes the given `func` with desired comparators based on the specified `compare_nans` + * parameter + * + * @tparam HasNested Flag indicating whether there are nested columns in haystack or needles + * @tparam Hasher Type of device hash function + * @tparam Func Type of the helper function doing `contains` check + * + * @param compare_nulls Control whether nulls should be compared as equal or not + * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not + * @param haystack_has_nulls Flag indicating whether haystack has nulls or not + * @param has_any_nulls Flag indicating whether there are nested nulls is either haystack or needles + * @param self_equal Self table comparator + * @param two_table_equal Two table comparator + * @param d_hasher Device hash functor + * @param func The input functor to invoke + */ +template +void dispatch_nan_comparator( + null_equality compare_nulls, + nan_equality compare_nans, + bool haystack_has_nulls, + bool has_any_nulls, + cudf::experimental::row::equality::self_comparator self_equal, + cudf::experimental::row::equality::two_table_comparator two_table_equal, + Hasher const& d_hasher, + Func&& func) +{ + // Distinguish probing scheme CG sizes between nested and flat types for better performance + auto const probing_scheme = [&]() { + if constexpr (HasNested) { + return cuco::experimental::linear_probing<4, Hasher>{d_hasher}; + } else { + return cuco::experimental::linear_probing<1, Hasher>{d_hasher}; + } + }(); + + if (compare_nans == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_equal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_equal_comparator{}); + func(d_self_equal, d_two_table_equal, probing_scheme); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; + auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_unequal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_unequal_comparator{}); + func(d_self_equal, d_two_table_equal, probing_scheme); + } +} + +} // namespace + +rmm::device_uvector contains(table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(cudf::have_same_types(haystack, needles), "Column types mismatch"); + + auto const haystack_has_nulls = has_nested_nulls(haystack); + auto const needles_has_nulls = has_nested_nulls(needles); + auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; + + auto const preprocessed_needles = + cudf::experimental::row::equality::preprocessed_table::create(needles, stream); + auto const preprocessed_haystack = + 
cudf::experimental::row::equality::preprocessed_table::create(haystack, stream); + + auto const haystack_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); + auto const d_haystack_hasher = haystack_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const needle_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); + auto const d_needle_hasher = needle_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const d_hasher = hasher_adapter{d_haystack_hasher, d_needle_hasher}; + + auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_haystack, preprocessed_needles); + + // The output vector. + auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); + + auto const haystack_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) { + return lhs_index_type{idx}; + })); + auto const needles_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) { + return rhs_index_type{idx}; + })); + + auto const helper_func = + [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) { + auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; + + auto set = cuco::experimental::static_set{ + cuco::experimental::extent{compute_hash_table_size(haystack.num_rows())}, + cuco::empty_key{lhs_index_type{-1}}, + d_equal, + probing_scheme, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + + // If the haystack table has nulls but they are compared unequal, don't insert them. 
+ // Otherwise, it was known to cause performance issue: + // - https://github.com/rapidsai/cudf/pull/6943 + // - https://github.com/rapidsai/cudf/pull/8277 + set.insert_if_async(haystack_iter, + haystack_iter + haystack.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + stream.value()); + } else { + set.insert_async(haystack_iter, haystack_iter + haystack.num_rows(), stream.value()); + } + + if (needles_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(needles, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + set.contains_if_async(needles_iter, + needles_iter + needles.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + contained.begin(), + stream.value()); + } else { + set.contains_async( + needles_iter, needles_iter + needles.num_rows(), contained.begin(), stream.value()); + } + }; + + if (cudf::detail::has_nested_columns(haystack)) { + dispatch_nan_comparator(compare_nulls, + compare_nans, + haystack_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + helper_func); + } else { + dispatch_nan_comparator(compare_nulls, + compare_nans, + haystack_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + helper_func); + } + + return contained; +} + +} // namespace cudf::detail From 821f4dea107db6a51fcbffff997fa6844ab5565f Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:44:58 -0600 Subject: [PATCH 56/60] Fixed an issue with output chunking computation stemming from input chunking. (#14889) Fixes https://github.com/rapidsai/cudf/issues/14883 The core issue was that the output chunking code was expecting all columns to have terminating pages that end in the same row count. Previously this was the case because we always processed entire row groups. But now with the subrowgroup reader, we can split on page boundaries that cause a jagged max row index for different columns. Example: ``` 0 100 200 Col A [-----------][--------------] 300 Col B [-----------][----------------------] ``` The input chunking would have computed a max row index of 200 for the subpass. But when computing the _output_ chunks, there was code that would have tried finding where row 300 was in column A, resulting in an out-of-bounds read. The fix is simply to cap the max row seen for column B to be the max expected row for the subpass. 
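To make the capping rule concrete, a small Python sketch of the logic (names are illustrative; the actual change is the `set_row_index` edit below), using the column A / column B numbers above:

```
def capped_row_index(page_end_row: int, max_row: int) -> int:
    # Mirrors the device-side fix: when a cap is supplied (max_row > 0),
    # clamp the page's terminal row index to the subpass maximum.
    return min(max_row, page_end_row) if max_row > 0 else page_end_row

subpass_max_row = 200                          # max row expected by the subpass
print(capped_row_index(200, subpass_max_row))  # column A's last page -> 200
print(capped_row_index(300, subpass_max_row))  # column B's dangling page -> 200 (capped)
print(capped_row_index(300, 0))                # pass-level call passes 0 -> no cap -> 300
```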
Authors: - https://github.com/nvdbaranec Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14889 --- cpp/src/io/parquet/reader_impl_chunking.cu | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 1bfe5745b9e..e0cb2fbb4f4 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -253,13 +253,15 @@ struct set_row_index { device_span chunks; device_span pages; device_span c_info; + size_t max_row; __device__ void operator()(size_t i) { - auto const& page = pages[i]; - auto const& chunk = chunks[page.chunk_idx]; - size_t const page_start_row = chunk.start_row + page.chunk_row + page.num_rows; - c_info[i].row_index = page_start_row; + auto const& page = pages[i]; + auto const& chunk = chunks[page.chunk_idx]; + size_t const page_end_row = chunk.start_row + page.chunk_row + page.num_rows; + // if we have been passed in a cap, apply it + c_info[i].row_index = max_row > 0 ? min(max_row, page_end_row) : page_end_row; } }; @@ -1288,7 +1290,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) thrust::for_each(rmm::exec_policy_nosync(_stream), iter, iter + pass.pages.size(), - set_row_index{pass.chunks, pass.pages, c_info}); + set_row_index{pass.chunks, pass.pages, c_info, 0}); // print_cumulative_page_info(pass.pages, pass.chunks, c_info, _stream); // get the next batch of pages @@ -1533,10 +1535,15 @@ void reader::impl::compute_output_chunks_for_subpass() thrust::equal_to{}, cumulative_page_sum{}); auto iter = thrust::make_counting_iterator(0); + // cap the max row in all pages by the max row we expect in the subpass. input chunking + // can cause "dangling" row counts where for example, only 1 column has a page whose + // maximum row is beyond our expected subpass max row, which will cause an out of + // bounds index in compute_page_splits_by_row. + auto const subpass_max_row = subpass.skip_rows + subpass.num_rows; thrust::for_each(rmm::exec_policy_nosync(_stream), iter, iter + subpass.pages.size(), - set_row_index{pass.chunks, subpass.pages, c_info}); + set_row_index{pass.chunks, subpass.pages, c_info, subpass_max_row}); // print_cumulative_page_info(subpass.pages, c_info, _stream); // compute the splits From 4444909b63b6854a9202f8093dfd7ae7833b0d1b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 26 Jan 2024 08:27:50 +0530 Subject: [PATCH 57/60] FIx more miscellaneous pytests failures (#14895) This PR fixes multiple issues: Enables corr and cov for Datetime and Timedelta types. Properly disables all and any for StringColumn. Preserves groupby categorical index ordering. Catches FutureWarnings in pytests. 
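A usage-level sketch of the first item, using the data from the new `test_cov_corr_datetime_timedelta` test (the column methods delegate to the int64 view of the values; exact numeric results are left to the test, which compares against pandas):

```
import cudf

s1 = cudf.Series([1, 2, 3, 4], dtype="datetime64[ns]")
s2 = cudf.Series([10, 1, 3, 5], dtype="datetime64[ns]")

# Previously these raised; with this change they are computed on the underlying
# int64 representation, and the new test checks the results against pandas.
# The same now applies to timedelta64[ns] series.
print(s1.corr(s2))
print(s1.cov(s2))
```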
--- python/cudf/cudf/core/column/datetime.py | 18 ++++++++++ python/cudf/cudf/core/column/string.py | 14 ++++++++ python/cudf/cudf/core/column/timedelta.py | 18 ++++++++++ python/cudf/cudf/core/dataframe.py | 4 +-- python/cudf/cudf/core/groupby/groupby.py | 9 ----- python/cudf/cudf/tests/test_dataframe.py | 43 ++++++++--------------- python/cudf/cudf/tests/test_groupby.py | 22 +++++++----- python/cudf/cudf/tests/test_joining.py | 4 +-- python/cudf/cudf/tests/test_stats.py | 38 ++++++++++---------- 9 files changed, 101 insertions(+), 69 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 6f7baebddd3..08a5103b409 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -532,6 +532,24 @@ def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: unit=self.time_unit, ).as_unit(self.time_unit) + def cov(self, other: DatetimeColumn) -> float: + if not isinstance(other, DatetimeColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) + + def corr(self, other: DatetimeColumn) -> float: + if not isinstance(other, DatetimeColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").corr( + other.as_numerical_column("int64") + ) + def quantile( self, q: np.ndarray, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 3d222cb762e..b115e6cda48 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5584,6 +5584,20 @@ def data(self): ] return self._data + def all(self, skipna: bool = True) -> bool: + # The skipna argument is only used for numerical columns. + # If all entries are null the result is True, including when the column + # is empty. + + raise NotImplementedError("`all` not implemented for `StringColumn`") + + def any(self, skipna: bool = True) -> bool: + # The skipna argument is only used for numerical columns. + # If all entries are null the result is True, including when the column + # is empty. + + raise NotImplementedError("`any` not implemented for `StringColumn`") + def data_array_view( self, *, mode="write" ) -> cuda.devicearray.DeviceNDArray: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b7209bbe7d0..2c12c77277c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -404,6 +404,24 @@ def std( unit=self.time_unit, ).as_unit(self.time_unit) + def cov(self, other: TimeDeltaColumn) -> float: + if not isinstance(other, TimeDeltaColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) + + def corr(self, other: TimeDeltaColumn) -> float: + if not isinstance(other, TimeDeltaColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").corr( + other.as_numerical_column("int64") + ) + def components(self, index=None) -> "cudf.DataFrame": """ Return a Dataframe of the components of the Timedeltas. 
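# A condensed sketch of how the string-column change above is exercised by the
# updated reduction tests later in this patch: DataFrame-level all()/any() with
# string columns present is only compared via bool_only=True, since the string
# column itself no longer pretends to implement these reductions.
import pandas as pd
import cudf

pdf = pd.DataFrame({"a": ["a", "b", "c"], "b": [True, False, False]})
gdf = cudf.DataFrame.from_pandas(pdf)

print(pdf.all(bool_only=True))  # only the boolean column participates
print(gdf.all(bool_only=True))  # expected to match pandas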
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5fa1956eaf1..c94b9040693 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1038,7 +1038,6 @@ def _init_from_dict_like( empty_column = functools.partial( cudf.core.column.column_empty, row_count=(0 if index is None else len(index)), - dtype=None, masked=index is not None, ) @@ -6115,7 +6114,8 @@ def _reduce( return Series( index=self._data.to_pandas_index()[:0] if axis == 0 - else source.index + else source.index, + dtype="float64", ) if axis in {0, 2}: if axis == 2 and op in ("kurtosis", "kurt", "skew"): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6aba93855a7..3d0d7d9eba6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -646,15 +646,6 @@ def agg(self, func): how="left", ) result = result.take(indices) - if isinstance(result._index, cudf.CategoricalIndex): - # Needs re-ordering the categories in the order - # they are after grouping. - result._index = cudf.Index( - result._index._column.reorder_categories( - result._index._column._get_decategorized_column() - ), - name=result._index.name, - ) if not self._as_index: result = result.reset_index() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 026f0aa845d..69be352cf63 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4173,15 +4173,7 @@ def test_dataframe_round_dict_decimal_validation(): [None, None], [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], [[1, True], [2, False], [3, False]], - pytest.param( - [["a", True], ["b", False], ["c", False]], - marks=[ - pytest_xfail( - reason="NotImplementedError: all does not " - "support columns of object dtype." - ) - ], - ), + [["a", True], ["b", False], ["c", False]], ], ) def test_all(data): @@ -4192,6 +4184,9 @@ def test_all(data): if np.array(data).ndim <= 1: pdata = pd.Series(data=data, dtype=dtype).replace([None], False) gdata = cudf.Series.from_pandas(pdata) + got = gdata.all() + expected = pdata.all() + assert_eq(got, expected) else: pdata = pd.DataFrame(data, columns=["a", "b"], dtype=dtype).replace( [None], False @@ -4203,10 +4198,10 @@ def test_all(data): got = gdata.all(bool_only=True) expected = pdata.all(bool_only=True) assert_eq(got, expected) - - got = gdata.all() - expected = pdata.all() - assert_eq(got, expected) + else: + got = gdata.all() + expected = pdata.all() + assert_eq(got, expected) @pytest.mark.parametrize( @@ -4226,21 +4221,13 @@ def test_all(data): [None, None], [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], [[1, True], [2, False], [3, False]], - pytest.param( - [["a", True], ["b", False], ["c", False]], - marks=[ - pytest_xfail( - reason="NotImplementedError: any does not " - "support columns of object dtype." - ) - ], - ), + [["a", True], ["b", False], ["c", False]], ], ) @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): # Provide a dtype when data is empty to avoid future pandas changes. 
- dtype = None if data else float + dtype = float if all(x is None for x in data) or len(data) < 1 else None if np.array(data).ndim <= 1: pdata = pd.Series(data=data, dtype=dtype) gdata = cudf.Series(data=data, dtype=dtype) @@ -4261,10 +4248,10 @@ def test_any(data, axis): got = gdata.any(bool_only=True) expected = pdata.any(bool_only=True) assert_eq(got, expected) - - got = gdata.any(axis=axis) - expected = pdata.any(axis=axis) - assert_eq(got, expected) + else: + got = gdata.any(axis=axis) + expected = pdata.any(axis=axis) + assert_eq(got, expected) @pytest.mark.parametrize("axis", [0, 1]) @@ -10197,7 +10184,7 @@ def test_empty_numeric_only(data): pdf = gdf.to_pandas() expected = pdf.prod(numeric_only=True) actual = gdf.prod(numeric_only=True) - assert_eq(expected, actual) + assert_eq(expected, actual, check_dtype=True) @pytest.fixture(params=[0, 10], ids=["empty", "10"]) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index f594963dcda..e3dceeca1f3 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -565,7 +565,9 @@ def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): dataset = groupby_jit_datasets[dataset] - with expect_warning_if(func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning): + with expect_warning_if( + func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning + ): groupby_apply_jit_reductions_special_vals_inner( func, dataset, dtype, special_val ) @@ -1409,7 +1411,7 @@ def test_groupby_multi_agg_hash_groupby(agg): @pytest.mark.parametrize( - "agg", ["min", "max", "idxmax", "idxmax", "sum", "prod", "count", "mean"] + "agg", ["min", "max", "idxmax", "idxmin", "sum", "prod", "count", "mean"] ) def test_groupby_nulls_basic(agg): check_dtype = agg not in _index_type_aggs @@ -1447,11 +1449,12 @@ def test_groupby_nulls_basic(agg): # TODO: fillna() used here since we don't follow # Pandas' null semantics. Should we change it? - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)().fillna(0), - getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), - check_dtype=check_dtype, - ) + with expect_warning_if(agg in {"idxmax", "idxmin"}): + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)().fillna(0), + getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), + check_dtype=check_dtype, + ) def test_groupby_nulls_in_index(): @@ -3702,8 +3705,9 @@ def test_categorical_grouping_pandas_compatibility(): with cudf.option_context("mode.pandas_compatible", True): actual = gdf.groupby("key", sort=False).sum() - expected = pdf.groupby("key", sort=False).sum() - + with pytest.warns(FutureWarning): + # observed param deprecation. 
+ expected = pdf.groupby("key", sort=False).sum() assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 8ce2adae15b..00b4a9b0e01 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -183,8 +183,8 @@ def test_dataframe_join_suffix(): assert list(expect.columns) == list(got.columns) assert_eq(expect.index.values, got.index.values) - got_sorted = got.sort_values(by=list(got.columns), axis=0) - expect_sorted = expect.sort_values(by=list(expect.columns), axis=0) + got_sorted = got.sort_values(by=["b_left", "c", "b_right"], axis=0) + expect_sorted = expect.sort_values(by=["b_left", "c", "b_right"], axis=0) for k in expect_sorted.columns: _check_series(expect_sorted[k].fillna(-1), got_sorted[k].fillna(-1)) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index edd7da3d42c..6dbb23fbf04 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -581,28 +581,28 @@ def test_min_count_ops(data, ops, skipna, min_count): @pytest.mark.parametrize( - "gsr", + "data1", [ - cudf.Series([1, 2, 3, 4], dtype="datetime64[ns]"), - cudf.Series([1, 2, 3, 4], dtype="timedelta64[ns]"), + [1, 2, 3, 4], + [10, 1, 3, 5], ], ) -def test_cov_corr_invalid_dtypes(gsr): - psr = gsr.to_pandas() - - assert_exceptions_equal( - lfunc=psr.corr, - rfunc=gsr.corr, - lfunc_args_and_kwargs=([psr],), - rfunc_args_and_kwargs=([gsr],), - ) - - assert_exceptions_equal( - lfunc=psr.cov, - rfunc=gsr.cov, - lfunc_args_and_kwargs=([psr],), - rfunc_args_and_kwargs=([gsr],), - ) +@pytest.mark.parametrize( + "data2", + [ + [1, 2, 3, 4], + [10, 1, 3, 5], + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_cov_corr_datetime_timedelta(data1, data2, dtype): + gsr1 = cudf.Series(data1, dtype=dtype) + gsr2 = cudf.Series(data2, dtype=dtype) + psr1 = gsr1.to_pandas() + psr2 = gsr2.to_pandas() + + assert_eq(psr1.corr(psr2), gsr1.corr(gsr2)) + assert_eq(psr1.cov(psr2), gsr1.cov(gsr2)) @pytest.mark.parametrize( From 23d189beb1a6f4dc281f22f5c4ce7772d2848767 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:09:34 -1000 Subject: [PATCH 58/60] Fix some pytests (#14894) np.product call I think will be redundant with the existing params, np.var call adjusted to what was tested before matmul failure existed upstream in pandas Snuck in a clean up files leftover by a parquet test (found these leftover when running the test suite locally) --- python/cudf/cudf/tests/test_array_function.py | 3 +-- python/cudf/cudf/tests/test_array_ufunc.py | 11 ++++++++++- python/cudf/cudf/tests/test_parquet.py | 18 ++++++++++-------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 758a8cbb535..58658f8b3cc 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -65,11 +65,10 @@ def test_array_func_cudf_series(np_ar, func): [ lambda x: np.mean(x, axis=0), lambda x: np.sum(x, axis=0), - lambda x: np.var(x, ddof=1), + lambda x: np.var(x, ddof=1, axis=0), lambda x: np.dot(x, x.transpose()), lambda x: np.all(x), lambda x: np.any(x), - lambda x: np.product(x), lambda x: np.product(x, axis=0), lambda x: np.product(x, axis=1), ], diff --git a/python/cudf/cudf/tests/test_array_ufunc.py 
b/python/cudf/cudf/tests/test_array_ufunc.py index f5e999559b3..3e3f3aa5dfa 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -7,14 +7,16 @@ import cupy as cp import numpy as np +import pandas as pd import pytest +from packaging import version import cudf from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 from cudf.testing._utils import ( assert_eq, - set_random_null_mask_inplace, expect_warning_if, + set_random_null_mask_inplace, ) _UFUNCS = [ @@ -89,6 +91,13 @@ def test_ufunc_index(request, ufunc): reason=f"cupy has no support for '{fname}'", ) ) + request.applymarker( + pytest.mark.xfail( + condition=fname == "matmul" + and version.parse(pd.__version__) < version.parse("3.0"), + reason="Fixed by https://github.com/pandas-dev/pandas/pull/57079", + ) + ) N = 100 # Avoid zeros in either array to skip division by 0 errors. Also limit the diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 73cbb924c65..69d3fe0b83f 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -21,7 +21,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf.core._compat import PANDAS_LT_153, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_153 from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -2683,29 +2683,31 @@ def test_parquet_writer_decimal(decimal_type, data): def test_parquet_writer_column_validation(): + cudf_parquet = BytesIO() + pandas_parquet = BytesIO() df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]}) pdf = df.to_pandas() with cudf.option_context("mode.pandas_compatible", True): with pytest.warns(UserWarning): - df.to_parquet("cudf.parquet") + df.to_parquet(cudf_parquet) if PANDAS_GE_200: with pytest.warns(UserWarning): - pdf.to_parquet("pandas.parquet") + pdf.to_parquet(pandas_parquet) assert_eq( - pd.read_parquet("cudf.parquet"), - cudf.read_parquet("pandas.parquet"), + pd.read_parquet(cudf_parquet), + cudf.read_parquet(pandas_parquet), ) assert_eq( - cudf.read_parquet("cudf.parquet"), - pd.read_parquet("pandas.parquet"), + cudf.read_parquet(cudf_parquet), + pd.read_parquet(pandas_parquet), ) with cudf.option_context("mode.pandas_compatible", False): with pytest.raises(ValueError): - df.to_parquet("cudf.parquet") + df.to_parquet(cudf_parquet) def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): From 7df96e70289ee38a3a03ce7d70086edc9af62933 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 25 Jan 2024 23:24:09 -0500 Subject: [PATCH 59/60] Align datetimeindex slicing behaviour with Pandas 2.x (#14887) * Align with pandas slicing behaviour for non-monotonic datetime index * Not a TODO --------- Co-authored-by: Ashwin Srinath --- python/cudf/cudf/core/indexed_frame.py | 15 +++++++++++--- python/cudf/cudf/tests/test_indexing.py | 27 +++++++++++++++++++++---- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fbbc606d7b8..cb7ff6a00d0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -198,9 +198,18 @@ def _get_label_range_or_mask(index, start, stop, step): if start is not None and stop is not None: if start > stop: return slice(0, 0, None) - # TODO: Once Index binary ops are updated to support logical_and, - # can use that instead of using cupy. 
- boolean_mask = cp.logical_and((index >= start), (index <= stop)) + if (start in index) and (stop in index): + # when we have a non-monotonic datetime index, return + # values in the slice defined by index_of(start) and + # index_of(end) + start_loc = index.get_loc(start.to_datetime64()) + stop_loc = index.get_loc(stop.to_datetime64()) + 1 + return slice(start_loc, stop_loc) + else: + raise KeyError( + "Value based partial slicing on non-monotonic DatetimeIndexes " + "with non-existing keys is not allowed.", + ) elif start is not None: boolean_mask = index >= start else: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 8a84a84f681..1cdaa3c52a7 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1278,15 +1278,15 @@ def test_iloc_categorical_index(index): @pytest.mark.parametrize( "sli", [ - slice("2001", "2020"), slice("2001", "2002"), slice("2002", "2001"), - slice(None, "2020"), slice("2001", None), ], ) @pytest.mark.parametrize("is_dataframe", [True, False]) def test_loc_datetime_index(sli, is_dataframe): + sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) + if is_dataframe is True: pd_data = pd.DataFrame( {"a": [1, 2, 3]}, @@ -1299,13 +1299,32 @@ def test_loc_datetime_index(sli, is_dataframe): ) gd_data = cudf.from_pandas(pd_data) - expect = pd_data.loc[sli] got = gd_data.loc[sli] - assert_eq(expect, got) +@pytest.mark.parametrize( + "sli", + [ + slice("2001", "2020"), + slice(None, "2020"), + ], +) +def test_loc_datetime_index_slice_not_in(sli): + pd_data = pd.Series( + [1, 2, 3], + pd.Series(["2001", "2009", "2002"], dtype="datetime64[ns]"), + ) + gd_data = cudf.from_pandas(pd_data) + with pytest.raises(KeyError): + assert_eq(pd_data.loc[sli], gd_data.loc[sli]) + + with pytest.raises(KeyError): + sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) + assert_eq(pd_data.loc[sli], gd_data.loc[sli]) + + @pytest.mark.parametrize( "gdf_kwargs", [ From 7d3e72af69ea38f4150a5a2ff352a300f704fcd0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 26 Jan 2024 23:28:39 +0530 Subject: [PATCH 60/60] Parquet Writer: Write `non-string` columns pandas-compatibility mode only (#14899) This PR enables writing of non-string columns in parquet writer only in pandas-compatibility mode. This PR: ``` = 8 failed, 102249 passed, 2090 skipped, 976 xfailed, 312 xpassed in 1363.59s (0:22:43) = ``` On `pandas_2.0_feature_branch`: ``` = 9 failed, 102247 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1336.47s (0:22:16) = ``` Co-authored-by: Lawrence Mitchell --------- Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/_lib/utils.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 50a47b4f507..7ba717a0003 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -85,7 +85,12 @@ cpdef generate_pandas_metadata(table, index): # Columns for name, col in table._data.items(): - col_names.append(name) + if cudf.get_option("mode.pandas_compatible"): + # in pandas-compat mode, non-string column names are stringified. + col_names.append(str(name)) + else: + col_names.append(name) + if isinstance(col.dtype, cudf.CategoricalDtype): raise ValueError( "'category' column dtypes are currently not "