diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a4a8f036174..d7d14ea12ff 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,7 +52,7 @@ jobs: steps: - name: Get PR info id: get-pr-info - uses: rapidsai/shared-actions/get-pr-info@main + uses: nv-gha-runners/get-pr-info@main - name: Checkout code repo uses: actions/checkout@v4 with: diff --git a/.gitignore b/.gitignore index 619e1464b2a..180a6a286e2 100644 --- a/.gitignore +++ b/.gitignore @@ -80,7 +80,6 @@ build/ cpp/build/ cpp/examples/*/install/ cpp/examples/*/build/ -cpp/examples/tpch/datagen/datafusion cpp/include/cudf/ipc_generated/*.h cpp/thirdparty/googletest/ diff --git a/README.md b/README.md index f62f7885d63..8f8c2adac2f 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 ### Conda -cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel: +cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: ```bash conda install -c rapidsai -c conda-forge -c nvidia \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7c387a6fc2a..26c086046a8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -385,6 +385,7 @@ add_library( src/io/json/nested_json_gpu.cu src/io/json/read_json.cu src/io/json/parser_features.cpp + src/io/json/process_tokens.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d2c22b788cb..6c5f4a68a4c 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -36,25 +36,25 @@ target_include_directories( ) add_library( - tpch_data_generator STATIC - common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp - common/tpch_data_generator/random_column_generator.cu + ndsh_data_generator STATIC + common/ndsh_data_generator/ndsh_data_generator.cpp common/ndsh_data_generator/table_helpers.cpp + common/ndsh_data_generator/random_column_generator.cu ) -target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17) +target_compile_features(ndsh_data_generator PUBLIC cxx_std_17 cuda_std_17) target_compile_options( - tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>" + ndsh_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>" "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>" ) target_link_libraries( - tpch_data_generator + ndsh_data_generator PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env> ) target_include_directories( - tpch_data_generator + ndsh_data_generator PUBLIC "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/include>" ) @@ -127,8 +127,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( - ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::nvbench - $<TARGET_NAME_IF_EXISTS:conda_env> + ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen + nvbench::nvbench $<TARGET_NAME_IF_EXISTS:conda_env> ) install( TARGETS ${CMAKE_BENCH_NAME} @@ -175,6 +175,14 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp) # ################################################################################################## # * transpose benchmark --------------------------------------------------------------------------- ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) +# ################################################################################################## +# * nds-h benchmark 
-------------------------------------------------------------------------------- +ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp) + # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( @@ -329,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp similarity index 97% rename from cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp rename to cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp index 236fe8095ad..fa7edd225ba 100644 --- a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "tpch_data_generator.hpp" +#include "ndsh_data_generator.hpp" #include "random_column_generator.hpp" #include "table_helpers.hpp" @@ -435,46 +435,37 @@ std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& o columns.push_back(std::move(l_quantity)); columns.push_back(std::move(l_discount)); columns.push_back(std::move(l_tax)); + columns.push_back(std::move(l_returnflag)); + columns.push_back(std::move(l_linestatus)); columns.push_back(std::move(l_shipdate_ts)); columns.push_back(std::move(l_commitdate_ts)); columns.push_back(std::move(l_receiptdate_ts)); - columns.push_back(std::move(l_returnflag)); - columns.push_back(std::move(l_linestatus)); columns.push_back(std::move(l_shipinstruct)); columns.push_back(std::move(l_shipmode)); columns.push_back(std::move(l_comment)); return std::make_unique<cudf::table>(std::move(columns)); } -std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem, +/** + * @brief Generate the part of the `orders` table dependent on the `lineitem` table + * + * @param lineitem_partial The partially generated `lineitem` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem_partial, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto const l_linestatus_mask = lineitem.column(0); - auto const l_orderkey = lineitem.column(1); - auto const l_discount = lineitem.column(6); - auto const l_tax = lineitem.column(7); - auto const l_extendedprice = lineitem.column(16); + auto const l_linestatus_mask = lineitem_partial.column(0); + auto const l_orderkey = lineitem_partial.column(1); + auto const l_extendedprice = lineitem_partial.column(6); + auto const l_discount = lineitem_partial.column(7); + auto const l_tax = 
lineitem_partial.column(8); std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns; - // Generate the `o_totalprice` column - // We calculate the `charge` column, which is a function of `l_extendedprice`, - // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge` - auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr); - auto o_totalprice = [&]() { - auto const keys = cudf::table_view({l_orderkey}); - cudf::groupby::groupby gb(keys); - std::vector<cudf::groupby::aggregation_request> requests; - requests.push_back(cudf::groupby::aggregation_request()); - requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>()); - requests[0].values = l_charge->view(); - auto agg_result = gb.aggregate(requests); - return cudf::round(agg_result.second[0].results[0]->view(), 2); - }(); - orders_dependent_columns.push_back(std::move(o_totalprice)); - // Generate the `o_orderstatus` column auto o_orderstatus = [&]() { auto const keys = cudf::table_view({l_orderkey}); @@ -529,6 +520,22 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view()); }(); orders_dependent_columns.push_back(std::move(o_orderstatus)); + + // Generate the `o_totalprice` column + // We calculate the `charge` column, which is a function of `l_extendedprice`, + // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge` + auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr); + auto o_totalprice = [&]() { + auto const keys = cudf::table_view({l_orderkey}); + cudf::groupby::groupby gb(keys); + std::vector<cudf::groupby::aggregation_request> requests; + requests.push_back(cudf::groupby::aggregation_request()); + requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>()); + requests[0].values = l_charge->view(); + auto agg_result = gb.aggregate(requests); + return cudf::round(agg_result.second[0].results[0]->view(), 2); + }(); + orders_dependent_columns.push_back(std::move(o_totalprice)); return std::make_unique<cudf::table>(std::move(orders_dependent_columns)); } @@ -730,9 +737,7 @@ generate_orders_lineitem_part(double scale_factor, // Generate the `part` table auto part = generate_part(scale_factor, stream, mr); - // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column, - // add the column to the `lineitem` table, and write the `lineitem` table to a parquet file - + // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column auto l_extendedprice = [&]() { auto const left = cudf::table_view( {lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()}); @@ -752,8 +757,9 @@ return cudf::round(col->view(), 2); }(); + // Insert the `l_extendedprice` column into the partial columns of the `lineitem` table auto lineitem_partial_columns = lineitem_partial->release(); - lineitem_partial_columns.push_back(std::move(l_extendedprice)); + lineitem_partial_columns.insert(lineitem_partial_columns.begin() + 6, std::move(l_extendedprice)); auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns)); // Generate the dependent columns of the `orders` table @@ -762,7 +768,7 @@ auto orders_independent_columns = orders_independent->release(); auto orders_dependent_columns = orders_dependent->release(); - orders_independent_columns.insert(orders_independent_columns.end(), + orders_independent_columns.insert(orders_independent_columns.begin() + 2, 
std::make_move_iterator(orders_dependent_columns.begin()), std::make_move_iterator(orders_dependent_columns.end())); diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.hpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp rename to cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.hpp diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu rename to cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.hpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp rename to cpp/benchmarks/common/ndsh_data_generator/random_column_generator.hpp diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp rename to cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.hpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp rename to cpp/benchmarks/common/ndsh_data_generator/table_helpers.hpp diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 61e79a47a50..e4ff0c8c4a7 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state) state.add_global_memory_reads<nvbench::int64_t>(num_rows); // add memory read from bitmaks if (!no_nulls) { - state.add_global_memory_reads<nvbench::int8_t>(2 * + state.add_global_memory_reads<nvbench::int8_t>(2L * cudf::bitmask_allocation_size_bytes(num_rows)); } // memory written depends on used hash @@ -63,37 +63,37 @@ }); } else if (hash_name == "md5") { // md5 creates a 32-byte string - state.add_global_memory_writes<nvbench::int8_t>(32 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(32L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); } else if (hash_name == "sha1") { // sha1 creates a 40-byte string - state.add_global_memory_writes<nvbench::int8_t>(40 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(40L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); }); } else if (hash_name == "sha224") { // sha224 creates a 56-byte string - state.add_global_memory_writes<nvbench::int8_t>(56 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(56L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); }); } else if (hash_name == "sha256") { // sha256 creates a 64-byte string - state.add_global_memory_writes<nvbench::int8_t>(64 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(64L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); }); } else if (hash_name == "sha384") { // sha384 creates a 96-byte string - state.add_global_memory_writes<nvbench::int8_t>(96 * 
num_rows); + state.add_global_memory_writes<nvbench::int8_t>(96L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); }); } else if (hash_name == "sha512") { // sha512 creates a 128-byte string - state.add_global_memory_writes<nvbench::int8_t>(128 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(128L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 7563c823454..ce115fd7723 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64; void parquet_read_common(cudf::size_type num_rows_to_read, cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, - nvbench::state& state) + nvbench::state& state, + size_t table_data_size = data_size) { cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); @@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read, }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second"); + state.add_element_count(static_cast<double>(table_data_size) / time, "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); @@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list +template <data_type DataType> +void BM_parquet_read_wide_tables(nvbench::state& state, + nvbench::type_list<nvbench::enum_type<DataType>> type_list) +{ + auto const d_type = get_type_or_group(static_cast<int32_t>(DataType)); + + auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality")); + auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + +void BM_parquet_read_wide_tables_mixed(nvbench::state& state) +{ + auto const d_type = []() { + auto d_type1 = get_type_or_group(static_cast<int32_t>(data_type::INTEGRAL)); + auto d_type2 = get_type_or_group(static_cast<int32_t>(data_type::FLOAT)); + d_type1.reserve(d_type1.size() + d_type2.size()); + std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1)); + return d_type1; + }(); + + auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality")); + auto const run_length = 
static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + using d_type_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table)) + .set_name("parquet_read_wide_tables") + .set_min_samples(4) + .set_type_axes_names({"data_type"}) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + +NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed) + .set_name("parquet_read_wide_tables_mixed") + .set_min_samples(4) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + // a benchmark for structs that only contain fixed-width types using d_type_list_struct_only = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 3abd4280081..7121cb9f034 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) } std::tuple, size_t, size_t> write_file_data( - nvbench::state& state, std::vector const& d_types) + nvbench::state& state, std::vector const& d_types, io_type io_source_type) { cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); @@ -63,7 +63,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_source_type}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto const num_threads = 
state.get_int64("num_threads"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .set_name("parquet_multithreaded_read_decode_fixed_width") @@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .set_name("parquet_multithreaded_read_decode_string") @@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .set_name("parquet_multithreaded_read_decode_list") @@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); // mixed data types: fixed width, strings NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) @@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") @@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .set_name("parquet_multithreaded_read_decode_chunked_string") @@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) 
.add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .set_name("parquet_multithreaded_read_decode_chunked_list") @@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); diff --git a/cpp/benchmarks/ndsh/README.md b/cpp/benchmarks/ndsh/README.md new file mode 100644 index 00000000000..0a462e1684e --- /dev/null +++ b/cpp/benchmarks/ndsh/README.md @@ -0,0 +1,11 @@ +# NDS-H Benchmarks for `libcudf` + +## Disclaimer + +NDS-H is derived from the TPC-H Benchmarks and as such any results obtained using NDS-H are not +comparable to published TPC-H Benchmark results, as the results obtained from using NDS-H do not +comply with the TPC-H Benchmarks. + +## Current Status + +For now, only Q1, Q5, Q6, Q9, and Q10 have been implemented diff --git a/cpp/examples/tpch/q1.cpp b/cpp/benchmarks/ndsh/q01.cpp similarity index 82% rename from cpp/examples/tpch/q1.cpp rename to cpp/benchmarks/ndsh/q01.cpp index 87b7e613766..ef709926ae9 100644 --- a/cpp/examples/tpch/q1.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -14,17 +14,19 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include #include +#include + /** - * @file q1.cpp - * @brief Implement query 1 of the TPC-H benchmark. + * @file q01.cpp + * @brief Implement query 1 of the NDS-H benchmark. * * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; * @@ -59,7 +61,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -[[nodiscard]] std::unique_ptr calc_disc_price( +[[nodiscard]] std::unique_ptr calculate_disc_price( cudf::column_view const& discount, cudf::column_view const& extendedprice, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -86,7 +88,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -[[nodiscard]] std::unique_ptr calc_charge( +[[nodiscard]] std::unique_ptr calculate_charge( cudf::column_view const& tax, cudf::column_view const& disc_price, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -101,16 +103,9 @@ return charge; } -int main(int argc, char const** argv) +void run_ndsh_q1(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", "l_linestatus", @@ -130,12 +125,12 @@ int main(int argc, char const** argv) // Read out the `lineitem` table from parquet file auto lineitem = - read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto disc_price = - calc_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice")); - auto charge = calc_charge(lineitem->column("l_tax"), disc_price->view()); + calculate_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice")); + auto charge = calculate_charge(lineitem->column("l_tax"), disc_price->view()); (*lineitem).append(disc_price, "disc_price").append(charge, "charge"); // Perform the group by operation @@ -167,9 +162,21 @@ int main(int argc, char const** argv) {"l_returnflag", "l_linestatus"}, {cudf::order::ASCENDING, cudf::order::ASCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q1.parquet"); - return 0; } + +void ndsh_q1(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q1(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q1).set_name("ndsh_q1").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q5.cpp b/cpp/benchmarks/ndsh/q05.cpp similarity index 80% rename from cpp/examples/tpch/q5.cpp rename to cpp/benchmarks/ndsh/q05.cpp index 12c186db10e..522bc4789c2 100644 --- a/cpp/examples/tpch/q5.cpp +++ b/cpp/benchmarks/ndsh/q05.cpp @@ -14,17 +14,19 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include #include +#include + /** - * @file q5.cpp - * @brief Implement query 5 of the TPC-H benchmark. + * @file q05.cpp + * @brief Implement query 5 of the NDS-H benchmark. * * create view customer as select * from '/tables/scale-1/customer.parquet'; * create view orders as select * from '/tables/scale-1/orders.parquet'; @@ -67,7 +69,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -[[nodiscard]] std::unique_ptr calc_revenue( +[[nodiscard]] std::unique_ptr calculate_revenue( cudf::column_view const& extendedprice, cudf::column_view const& discount, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -86,16 +88,9 @@ return revenue; } -int main(int argc, char const** argv) +void run_ndsh_q5(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( @@ -125,17 +120,17 @@ int main(int argc, char const** argv) // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = - read_parquet(args.dataset_dir + "/customer.parquet", {"c_custkey", "c_nationkey"}); + read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"}); auto const orders = - read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); - auto const lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", + read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(sources["lineitem"].make_source_info(), {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); auto const supplier = - read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); auto const nation = - read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_regionkey", "n_name"}); + read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); auto const region = - read_parquet(args.dataset_dir + "/region.parquet", region_cols, std::move(region_pred)); + read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred)); // Perform the joins auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); @@ -147,7 +142,7 @@ int main(int argc, char const** argv) // Calculate and append the `revenue` column auto revenue = - calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + calculate_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); (*joined_table).append(revenue, "revenue"); // Perform the groupby operation @@ -162,9 +157,22 @@ int main(int argc, char const** argv) auto const orderedby_table = apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q5.parquet"); - return 0; } + +void ndsh_q5(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources( + scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q5(state, sources); }); 
+} + +NVBENCH_BENCH(ndsh_q5).set_name("ndsh_q5").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q6.cpp b/cpp/benchmarks/ndsh/q06.cpp similarity index 79% rename from cpp/examples/tpch/q6.cpp rename to cpp/benchmarks/ndsh/q06.cpp index 92dac40c768..04078547973 100644 --- a/cpp/examples/tpch/q6.cpp +++ b/cpp/benchmarks/ndsh/q06.cpp @@ -14,17 +14,20 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include +#include #include +#include + /** - * @file q6.cpp - * @brief Implement query 6 of the TPC-H benchmark. + * @file q06.cpp + * @brief Implement query 6 of the NDS-H benchmark. * * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; * @@ -48,7 +51,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -[[nodiscard]] std::unique_ptr calc_revenue( +[[nodiscard]] std::unique_ptr calculate_revenue( cudf::column_view const& extendedprice, cudf::column_view const& discount, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -60,16 +63,9 @@ return revenue; } -int main(int argc, char const** argv) +void run_ndsh_q6(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Read out the `lineitem` table from parquet file std::vector const lineitem_cols = { "l_extendedprice", "l_discount", "l_shipdate", "l_quantity"}; @@ -88,7 +84,7 @@ int main(int argc, char const** argv) auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); auto lineitem = - read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Cast the discount and quantity columns to float32 and append to lineitem table auto discout_float = @@ -99,8 +95,8 @@ int main(int argc, char const** argv) (*lineitem).append(discout_float, "l_discount_float").append(quantity_float, "l_quantity_float"); // Apply the filters - auto const discount_ref = cudf::ast::column_reference(lineitem->col_id("l_discount_float")); - auto const quantity_ref = cudf::ast::column_reference(lineitem->col_id("l_quantity_float")); + auto const discount_ref = cudf::ast::column_reference(lineitem->column_id("l_discount_float")); + auto const quantity_ref = cudf::ast::column_reference(lineitem->column_id("l_quantity_float")); auto discount_lower = cudf::numeric_scalar(0.05); auto const discount_lower_literal = cudf::ast::literal(discount_lower); @@ -123,16 +119,28 @@ int main(int argc, char const** argv) auto const filtered_table = apply_filter(lineitem, discount_quantity_pred); // Calculate the `revenue` column - auto revenue = - calc_revenue(filtered_table->column("l_extendedprice"), filtered_table->column("l_discount")); + auto revenue = calculate_revenue(filtered_table->column("l_extendedprice"), + filtered_table->column("l_discount")); // Sum the `revenue` column auto const revenue_view = revenue->view(); auto const result_table = apply_reduction(revenue_view, cudf::aggregation::Kind::SUM, "revenue"); - timer.print_elapsed_millis(); - // Write query result to a 
parquet file result_table->to_parquet("q6.parquet"); - return 0; } + +void ndsh_q6(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q6(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q6).set_name("ndsh_q6").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q9.cpp b/cpp/benchmarks/ndsh/q09.cpp similarity index 78% rename from cpp/examples/tpch/q9.cpp rename to cpp/benchmarks/ndsh/q09.cpp index 2882182aa2b..59218ab8912 100644 --- a/cpp/examples/tpch/q9.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" +#include #include #include #include @@ -24,9 +24,11 @@ #include #include +#include + /** - * @file q9.cpp - * @brief Implement query 9 of the TPC-H benchmark. + * @file q09.cpp + * @brief Implement query 9 of the NDS-H benchmark. * * create view part as select * from '/tables/scale-1/part.parquet'; * create view supplier as select * from '/tables/scale-1/supplier.parquet'; @@ -79,7 +81,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -[[nodiscard]] std::unique_ptr calc_amount( +[[nodiscard]] std::unique_ptr calculate_amount( cudf::column_view const& discount, cudf::column_view const& extendedprice, cudf::column_view const& supplycost, @@ -109,28 +111,21 @@ return amount; } -int main(int argc, char const** argv) +void run_ndsh_q9(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Read out the table from parquet files auto const lineitem = read_parquet( - args.dataset_dir + "/lineitem.parquet", + sources["lineitem"].make_source_info(), {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); - auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_name"}); + auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"}); auto const orders = - read_parquet(args.dataset_dir + "/orders.parquet", {"o_orderkey", "o_orderdate"}); - auto const part = read_parquet(args.dataset_dir + "/part.parquet", {"p_partkey", "p_name"}); - auto const partsupp = read_parquet(args.dataset_dir + "/partsupp.parquet", + read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(sources["partsupp"].make_source_info(), {"ps_suppkey", "ps_partkey", "ps_supplycost"}); auto const supplier = - read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` @@ -150,10 +145,10 @@ 
int main(int argc, char const** argv) // Calculate the `nation`, `o_year`, and `amount` columns auto n_name = std::make_unique(joined_table->column("n_name")); auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); - auto amount = calc_amount(joined_table->column("l_discount"), - joined_table->column("l_extendedprice"), - joined_table->column("ps_supplycost"), - joined_table->column("l_quantity")); + auto amount = calculate_amount(joined_table->column("l_discount"), + joined_table->column("l_extendedprice"), + joined_table->column("ps_supplycost"), + joined_table->column("l_quantity")); // Put together the `profit` table std::vector> profit_columns; @@ -175,9 +170,22 @@ int main(int argc, char const** argv) auto const orderedby_table = apply_orderby( groupedby_table, {"nation", "o_year"}, {cudf::order::ASCENDING, cudf::order::DESCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q9.parquet"); - return 0; } + +void ndsh_q9(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources( + scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q9(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q9).set_name("ndsh_q9").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp similarity index 81% rename from cpp/examples/tpch/q10.cpp rename to cpp/benchmarks/ndsh/q10.cpp index fdf147b50e0..a520480020a 100644 --- a/cpp/examples/tpch/q10.cpp +++ b/cpp/benchmarks/ndsh/q10.cpp @@ -14,17 +14,19 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include #include +#include + /** * @file q10.cpp - * @brief Implement query 10 of the TPC-H benchmark. + * @brief Implement query 10 of the NDS-H benchmark. * * create view customer as select * from '/tables/scale-1/customer.parquet'; * create view orders as select * from '/tables/scale-1/orders.parquet'; @@ -72,7 +74,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -[[nodiscard]] std::unique_ptr calc_revenue( +[[nodiscard]] std::unique_ptr calculate_revenue( cudf::column_view const& extendedprice, cudf::column_view const& discount, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -90,16 +92,10 @@ mr); return revenue; } -int main(int argc, char const** argv) -{ - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; +void run_ndsh_q10(nvbench::state& state, + std::unordered_map& sources) +{ // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( @@ -126,15 +122,15 @@ int main(int argc, char const** argv) // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = read_parquet( - args.dataset_dir + "/customer.parquet", + sources["customer"].make_source_info(), {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); auto const orders = - read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); auto const lineitem = - read_parquet(args.dataset_dir + "/lineitem.parquet", + read_parquet(sources["lineitem"].make_source_info(), {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, std::move(lineitem_pred)); - auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"}); + auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"}); // Perform the joins auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); @@ -143,7 +139,7 @@ int main(int argc, char const** argv) // Calculate and append the `revenue` column auto revenue = - calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + calculate_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); (*joined_table).append(revenue, "revenue"); // Perform the groupby operation @@ -159,9 +155,22 @@ int main(int argc, char const** argv) auto const orderedby_table = apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q10.parquet"); - return 0; } + +void ndsh_q10(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources( + scale_factor, {"customer", "orders", "lineitem", "nation"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q10(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q10).set_name("ndsh_q10").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp new file mode 100644 index 00000000000..2d514764fc2 --- /dev/null +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities.hpp" + +#include "common/ndsh_data_generator/ndsh_data_generator.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace { + +std::vector const ORDERS_SCHEMA = {"o_orderkey", + "o_custkey", + "o_orderstatus", + "o_totalprice", + "o_orderdate", + "o_orderpriority", + "o_clerk", + "o_shippriority", + "o_comment"}; +std::vector const LINEITEM_SCHEMA = {"l_orderkey", + "l_partkey", + "l_suppkey", + "l_linenumber", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_tax", + "l_returnflag", + "l_linestatus", + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "l_shipinstruct", + "l_shipmode", + "l_comment"}; +std::vector const PART_SCHEMA = {"p_partkey", + "p_name", + "p_mfgr", + "p_brand", + "p_type", + "p_size", + "p_container", + "p_retailprice", + "p_comment"}; +std::vector const PARTSUPP_SCHEMA = { + "ps_partkey", "ps_suppkey", "ps_availqty", "ps_supplycost", "ps_comment"}; +std::vector const SUPPLIER_SCHEMA = { + "s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"}; +std::vector const CUSTOMER_SCHEMA = {"c_custkey", + "c_name", + "c_address", + "c_nationkey", + "c_phone", + "c_acctbal", + "c_mktsegment", + "c_comment"}; +std::vector const NATION_SCHEMA = { + "n_nationkey", "n_name", "n_regionkey", "n_comment"}; +std::vector const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"}; + +} // namespace + +cudf::table_view table_with_names::table() const { return tbl->view(); } + +cudf::column_view table_with_names::column(std::string const& col_name) const +{ + return tbl->view().column(column_id(col_name)); +} + +std::vector const& table_with_names::column_names() const { return col_names; } + +cudf::size_type table_with_names::column_id(std::string const& col_name) const +{ + auto it = std::find(col_names.begin(), col_names.end(), col_name); + if (it == col_names.end()) { + std::string err_msg = "Column `" + col_name + "` not found"; + throw std::runtime_error(err_msg); + } + return std::distance(col_names.begin(), it); +} + +table_with_names& table_with_names::append(std::unique_ptr& col, + std::string const& col_name) +{ + auto cols = tbl->release(); + cols.push_back(std::move(col)); + tbl = std::make_unique(std::move(cols)); + col_names.push_back(col_name); + return (*this); +} + +cudf::table_view table_with_names::select(std::vector const& col_names) const +{ + CUDF_FUNC_RANGE(); + std::vector col_indices; + for (auto const& col_name : col_names) { + col_indices.push_back(column_id(col_name)); + } + return tbl->select(col_indices); +} + +void table_with_names::to_parquet(std::string const& filepath) const +{ + CUDF_FUNC_RANGE(); + auto const sink_info = cudf::io::sink_info(filepath); + cudf::io::table_metadata metadata; + metadata.schema_info = + std::vector(col_names.begin(), col_names.end()); + auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; + 
auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + cudf::io::write_parquet(options); +} + +std::unique_ptr<cudf::table> join_and_gather(cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector<cudf::size_type> const& left_on, + std::vector<cudf::size_type> const& right_on, + cudf::null_equality compare_nulls) +{ + CUDF_FUNC_RANGE(); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = cudf::inner_join( + left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + + auto const left_indices_span = cudf::device_span<cudf::size_type const>{*left_join_indices}; + auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices}; + + auto const left_indices_col = cudf::column_view{left_indices_span}; + auto const right_indices_col = cudf::column_view{right_indices_span}; + + auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy); + auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy); + + auto joined_cols = left_result->release(); + auto right_cols = right_result->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique<cudf::table>(std::move(joined_cols)); +} + +std::unique_ptr<table_with_names> apply_inner_join( + std::unique_ptr<table_with_names> const& left_input, + std::unique_ptr<table_with_names> const& right_input, + std::vector<std::string> const& left_on, + std::vector<std::string> const& right_on, + cudf::null_equality compare_nulls) +{ + CUDF_FUNC_RANGE(); + std::vector<cudf::size_type> left_on_indices; + std::vector<cudf::size_type> right_on_indices; + std::transform( + left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) { + return left_input->column_id(col_name); + }); + std::transform(right_on.begin(), + right_on.end(), + std::back_inserter(right_on_indices), + [&](auto const& col_name) { return right_input->column_id(col_name); }); + auto table = join_and_gather( + left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls); + std::vector<std::string> merged_column_names; + merged_column_names.reserve(left_input->column_names().size() + + right_input->column_names().size()); + std::copy(left_input->column_names().begin(), + left_input->column_names().end(), + std::back_inserter(merged_column_names)); + std::copy(right_input->column_names().begin(), + right_input->column_names().end(), + std::back_inserter(merged_column_names)); + return std::make_unique<table_with_names>(std::move(table), merged_column_names); +} + +std::unique_ptr<table_with_names> apply_filter(std::unique_ptr<table_with_names> const& table, + cudf::ast::operation const& predicate) +{ + CUDF_FUNC_RANGE(); + auto const boolean_mask = cudf::compute_column(table->table(), predicate); + auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); + return std::make_unique<table_with_names>(std::move(result_table), table->column_names()); +} + +std::unique_ptr<table_with_names> apply_mask(std::unique_ptr<table_with_names> const& table, + std::unique_ptr<cudf::column> const& mask) +{ + CUDF_FUNC_RANGE(); + auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); + return std::make_unique<table_with_names>(std::move(result_table), table->column_names()); +} + +std::unique_ptr<table_with_names> apply_groupby(std::unique_ptr<table_with_names> const& table, + groupby_context_t const& 
ctx) +{ + CUDF_FUNC_RANGE(); + auto const keys = table->select(ctx.keys); + cudf::groupby::groupby groupby_obj(keys); + std::vector<std::string> result_column_names; + result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end()); + std::vector<cudf::groupby::aggregation_request> requests; + for (auto& [value_col, aggregations] : ctx.values) { + requests.emplace_back(cudf::groupby::aggregation_request()); + for (auto& agg : aggregations) { + if (agg.first == cudf::aggregation::Kind::SUM) { + requests.back().aggregations.push_back( + cudf::make_sum_aggregation<cudf::groupby_aggregation>()); + } else if (agg.first == cudf::aggregation::Kind::MEAN) { + requests.back().aggregations.push_back( + cudf::make_mean_aggregation<cudf::groupby_aggregation>()); + } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) { + requests.back().aggregations.push_back( + cudf::make_count_aggregation<cudf::groupby_aggregation>()); + } else { + throw std::runtime_error("Unsupported aggregation"); + } + result_column_names.push_back(agg.second); + } + requests.back().values = table->column(value_col); + } + auto agg_results = groupby_obj.aggregate(requests); + std::vector<std::unique_ptr<cudf::column>> result_columns; + for (auto i = 0; i < agg_results.first->num_columns(); i++) { + auto col = std::make_unique<cudf::column>(agg_results.first->get_column(i)); + result_columns.push_back(std::move(col)); + } + for (size_t i = 0; i < agg_results.second.size(); i++) { + for (size_t j = 0; j < agg_results.second[i].results.size(); j++) { + result_columns.push_back(std::move(agg_results.second[i].results[j])); + } + } + auto result_table = std::make_unique<cudf::table>(std::move(result_columns)); + return std::make_unique<table_with_names>(std::move(result_table), result_column_names); +} + +std::unique_ptr<table_with_names> apply_orderby(std::unique_ptr<table_with_names> const& table, + std::vector<std::string> const& sort_keys, + std::vector<cudf::order> const& sort_key_orders) +{ + CUDF_FUNC_RANGE(); + std::vector<cudf::column_view> column_views; + for (auto& key : sort_keys) { + column_views.push_back(table->column(key)); + } + auto result_table = + cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders); + return std::make_unique<table_with_names>(std::move(result_table), table->column_names()); +} + +std::unique_ptr<table_with_names> apply_reduction(cudf::column_view const& column, + cudf::aggregation::Kind const& agg_kind, + std::string const& col_name) +{ + CUDF_FUNC_RANGE(); + auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>(); + auto const result = cudf::reduce(column, *agg, column.type()); + cudf::size_type const len = 1; + auto col = cudf::make_column_from_scalar(*result, len); + std::vector<std::unique_ptr<cudf::column>> columns; + columns.push_back(std::move(col)); + auto result_table = std::make_unique<cudf::table>(std::move(columns)); + std::vector<std::string> col_names = {col_name}; + return std::make_unique<table_with_names>(std::move(result_table), col_names); +} + +std::unique_ptr<table_with_names> read_parquet( + cudf::io::source_info const& source_info, + std::vector<std::string> const& columns, + std::unique_ptr<cudf::ast::operation> const& predicate) +{ + CUDF_FUNC_RANGE(); + auto builder = cudf::io::parquet_reader_options_builder(source_info); + if (!columns.empty()) { builder.columns(columns); } + if (predicate) { builder.filter(*predicate); } + auto const options = builder.build(); + auto table_with_metadata = cudf::io::read_parquet(options); + std::vector<std::string> column_names; + for (auto const& col_info : table_with_metadata.metadata.schema_info) { + column_names.push_back(col_info.name); + } + return std::make_unique<table_with_names>(std::move(table_with_metadata.tbl), column_names); +} + +std::tm make_tm(int year, int month, int day) +{ + std::tm tm{}; + tm.tm_year = year - 1900; + tm.tm_mon = month - 1; + tm.tm_mday = day; + return tm; +} + +int32_t days_since_epoch(int year, int month, int day) +{ + 
std::tm tm = make_tm(year, month, day); + std::tm epoch = make_tm(1970, 1, 1); + std::time_t time = std::mktime(&tm); + std::time_t epoch_time = std::mktime(&epoch); + double diff = std::difftime(time, epoch_time) / (60 * 60 * 24); + return static_cast<int32_t>(diff); +} + +void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table, + std::vector<std::string> const& col_names, + parquet_device_buffer& source) +{ + CUDF_FUNC_RANGE(); + auto const stream = cudf::get_default_stream(); + + // Prepare the table metadata + cudf::io::table_metadata metadata; + std::vector<cudf::io::column_name_info> col_name_infos; + for (auto& col_name : col_names) { + col_name_infos.push_back(cudf::io::column_name_info(col_name)); + } + metadata.schema_info = col_name_infos; + auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; + + // Declare a host and device buffer + std::vector<char> h_buffer; + + // Write parquet data to host buffer + auto builder = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + cudf::io::write_parquet(options); + + // Copy host buffer to device buffer + source.d_buffer.resize(h_buffer.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); +} + +void generate_parquet_data_sources(double scale_factor, + std::vector<std::string> const& table_names, + std::unordered_map<std::string, parquet_device_buffer>& sources) +{ + CUDF_FUNC_RANGE(); + std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) { + sources[table_name] = parquet_device_buffer(); + }); + + auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto partsupp = cudf::datagen::generate_partsupp( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto supplier = cudf::datagen::generate_supplier( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto customer = cudf::datagen::generate_customer( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + auto region = cudf::datagen::generate_region(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]); + write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]); + write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]); + write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]); + write_to_parquet_device_buffer(std::move(customer), CUSTOMER_SCHEMA, sources["customer"]); + write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]); + write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]); + write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]); +}
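These two helpers are the glue between data generation and the timed benchmark region: each table is serialized to Parquet once, and the bytes stay resident in device memory. A minimal sketch of how a query benchmark might consume them — the `ndsh_q6` body, the `scale_factor` axis name, and the column list here are illustrative assumptions, not part of this patch:

```cpp
#include <nvbench/nvbench.cuh>

void ndsh_q6(nvbench::state& state)
{
  // Untimed setup: generate lineitem at the requested scale factor and stage
  // it as Parquet bytes in device memory.
  std::unordered_map<std::string, parquet_device_buffer> sources;
  generate_parquet_data_sources(state.get_float64("scale_factor"), {"lineitem"}, sources);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // Timed region: the read pulls from the device buffer, so host I/O and
    // datagen cost are excluded from the measurement.
    auto lineitem = read_parquet(sources["lineitem"].make_source_info(),
                                 {"l_extendedprice", "l_discount", "l_quantity"});
  });
}
```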
diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp new file mode 100644 index 00000000000..762e43deccf --- /dev/null +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cudf/io/types.hpp> +#include <cudf/table/table.hpp> +#include <cudf/table/table_view.hpp> + +#include <rmm/device_uvector.hpp> + +/** + * @brief A class to represent a table with column names attached + */ +class table_with_names { + public: + table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names) + : tbl(std::move(tbl)), col_names(col_names){}; + /** + * @brief Return the table view + */ + [[nodiscard]] cudf::table_view table() const; + /** + * @brief Return the column view for a given column name + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::column_view column(std::string const& col_name) const; + /** + * @brief Return the column names of the table + */ + [[nodiscard]] std::vector<std::string> const& column_names() const; + /** + * @brief Translate a column name to a column index + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::size_type column_id(std::string const& col_name) const; + /** + * @brief Append a column to the table + * + * @param col The column to append + * @param col_name The name of the appended column + */ + table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name); + /** + * @brief Select a subset of columns from the table + * + * @param col_names The names of the columns to select + */ + [[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const; + /** + * @brief Write the table to a parquet file + * + * @param filepath The path to the parquet file + */ + void to_parquet(std::string const& filepath) const; + + private: + std::unique_ptr<cudf::table> tbl; + std::vector<std::string> col_names; +}; + +/** + * @brief Inner join two tables and gather the result + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr<cudf::table> join_and_gather( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector<cudf::size_type> const& left_on, + std::vector<cudf::size_type> const& right_on, + cudf::null_equality compare_nulls); + +/** + * @brief Apply an inner join operation to two tables + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_inner_join( + std::unique_ptr<table_with_names> const& left_input, + std::unique_ptr<table_with_names> const& right_input, + std::vector<std::string> const& left_on, + std::vector<std::string> const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL); + +/** + * @brief Apply a filter predicate to a table + * + * @param table The input table + * @param predicate The filter predicate + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_filter( + std::unique_ptr<table_with_names> const& table, cudf::ast::operation const& predicate); + +/** + * @brief Apply a boolean mask to a table + * + * @param table The 
input table + * @param mask The boolean mask + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_mask( + std::unique_ptr<table_with_names> const& table, std::unique_ptr<cudf::column> const& mask); + +/** + * Struct representing group by key columns, value columns, and the type of aggregations to perform + * on the value columns + */ +struct groupby_context_t { + std::vector<std::string> keys; + std::unordered_map<std::string, std::vector<std::pair<cudf::aggregation::Kind, std::string>>> + values; +}; + +/** + * @brief Apply a groupby operation to a table + * + * @param table The input table + * @param ctx The groupby context + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_groupby( + std::unique_ptr<table_with_names> const& table, groupby_context_t const& ctx); + +/** + * @brief Apply an order by operation to a table + * + * @param table The input table + * @param sort_keys The sort keys + * @param sort_key_orders The sort key orders + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_orderby( + std::unique_ptr<table_with_names> const& table, + std::vector<std::string> const& sort_keys, + std::vector<cudf::order> const& sort_key_orders); + +/** + * @brief Apply a reduction operation to a column + * + * @param column The input column + * @param agg_kind The aggregation kind + * @param col_name The name of the output column + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_reduction( + cudf::column_view const& column, + cudf::aggregation::Kind const& agg_kind, + std::string const& col_name); + +/** + * @brief Read a parquet file into a table + * + * @param source_info The source of the parquet file + * @param columns The columns to read + * @param predicate The filter predicate to pushdown + */ +[[nodiscard]] std::unique_ptr<table_with_names> read_parquet( + cudf::io::source_info const& source_info, + std::vector<std::string> const& columns = {}, + std::unique_ptr<cudf::ast::operation> const& predicate = nullptr); + +/** + * @brief Generate the `std::tm` structure from year, month, and day + * + * @param year The year + * @param month The month + * @param day The day + */ +std::tm make_tm(int year, int month, int day); + +/** + * @brief Calculate the number of days since the UNIX epoch + * + * @param year The year + * @param month The month + * @param day The day + */ +int32_t days_since_epoch(int year, int month, int day); + +/** + * @brief Struct representing a parquet device buffer + */ +struct parquet_device_buffer { + parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {}; + cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); } + rmm::device_uvector<std::byte> d_buffer; +}; + +/** + * @brief Write a `cudf::table` to a parquet device buffer + * + * @param table The `cudf::table` to write + * @param col_names The column names of the table + * @param source The parquet device buffer to write the table to + */ +void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table, + std::vector<std::string> const& col_names, + parquet_device_buffer& source); + +/** + * @brief Generate NDS-H tables and write to parquet device buffers + * + * @param scale_factor The scale factor of NDS-H tables to generate + * @param table_names The names of the tables to generate + * @param sources The parquet data sources to populate + */ +void generate_parquet_data_sources(double scale_factor, + std::vector<std::string> const& table_names, + std::unordered_map<std::string, parquet_device_buffer>& sources);
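Together these declarations form a small relational helper API over libcudf: scan, filter, join, group, sort. A sketch of how an NDS-H style aggregation might be composed from it — the Parquet path and output column names are hypothetical:

```cpp
auto lineitem = read_parquet(cudf::io::source_info("lineitem.parquet"),
                             {"l_returnflag", "l_quantity"});

// Group by l_returnflag, producing a sum and a mean of l_quantity
groupby_context_t ctx{
  {"l_returnflag"},
  {{"l_quantity",
    {{cudf::aggregation::Kind::SUM, "sum_qty"}, {cudf::aggregation::Kind::MEAN, "avg_qty"}}}}};

auto grouped = apply_groupby(lineitem, ctx);
auto sorted  = apply_orderby(grouped, {"l_returnflag"}, {cudf::order::ASCENDING});
sorted->to_parquet("q1.parquet");
```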
diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp new file mode 100644 index 00000000000..adc3dddc59c --- /dev/null +++ b/cpp/benchmarks/text/word_minhash.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <benchmarks/common/generate_input.hpp> + +#include <cudf/column/column_factories.hpp> +#include <cudf/filling.hpp> +#include <cudf/scalar/scalar.hpp> +#include <cudf/strings/strings_column_view.hpp> + +#include <nvtext/minhash.hpp> + +#include <rmm/device_buffer.hpp> + +#include <nvbench/nvbench.cuh> + +static void bench_word_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width")); + auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; + + data_profile const strings_profile = + data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + + auto const num_offsets = (num_rows / row_width) + 1; + auto offsets = cudf::sequence(num_offsets, + cudf::numeric_scalar<cudf::size_type>(0), + cudf::numeric_scalar<cudf::size_type>(row_width)); + + auto source = cudf::make_lists_column(num_offsets - 1, + std::move(offsets), + std::move(strings_table->release().front()), + 0, + rmm::device_buffer{}); + + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads<nvbench::int8_t>(chars_size); + state.add_global_memory_writes<nvbench::int32_t>(num_rows); // output are hashes + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) + : nvtext::word_minhash(source->view(), seeds.view()); + }); +} + +NVBENCH_BENCH(bench_word_minhash) + .set_name("word_minhash") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) + .add_int64_axis("row_width", {10, 100, 1000}) + .add_int64_axis("seed_count", {2, 25}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. 
The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 8e8d8bd0b78..25984df1b60 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -57,7 +57,6 @@ build_example() { } build_example basic -build_example tpch build_example strings build_example nested_types build_example parquet_io diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt deleted file mode 100644 index 373a6d72d56..00000000000 --- a/cpp/examples/tpch/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -cmake_minimum_required(VERSION 3.26.4) - -include(../set_cuda_architecture.cmake) - -rapids_cuda_init_architectures(tpch_example) -rapids_cuda_set_architectures(RAPIDS) - -project( - tpch_example - VERSION 0.0.1 - LANGUAGES CXX CUDA -) - -include(../fetch_dependencies.cmake) - -add_executable(tpch_q1 q1.cpp) -target_link_libraries(tpch_q1 PRIVATE cudf::cudf) -target_compile_features(tpch_q1 PRIVATE cxx_std_17) - -add_executable(tpch_q5 q5.cpp) -target_link_libraries(tpch_q5 PRIVATE cudf::cudf) -target_compile_features(tpch_q5 PRIVATE cxx_std_17) - -add_executable(tpch_q6 q6.cpp) -target_link_libraries(tpch_q6 PRIVATE cudf::cudf) -target_compile_features(tpch_q6 PRIVATE cxx_std_17) - -add_executable(tpch_q9 q9.cpp) -target_link_libraries(tpch_q9 PRIVATE cudf::cudf) -target_compile_features(tpch_q9 PRIVATE cxx_std_17) - -add_executable(tpch_q10 q10.cpp) -target_link_libraries(tpch_q10 PRIVATE cudf::cudf) -target_compile_features(tpch_q10 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md deleted file mode 100644 index 8c046c3f1e8..00000000000 --- a/cpp/examples/tpch/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# TPC-H Derived Examples - -Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format. - -## Requirements - -- Rust -- [libcudf](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment) - -## Running Queries - -1. Build the `libcudf` examples. -```bash -cd cudf/cpp/examples -./build.sh -``` -The TPC-H query binaries would be built inside `tpch/build`. - -2. Generate the dataset. -```bash -cd tpch/datagen -./datagen.sh [scale factor (1/10)] -``` - -The parquet files will be generated in `tpch/datagen/datafusion/benchmarks/data/tpch_sf[scale factor]`. - -3. Set these environment variables for optimized runtimes. -```bash -export KVIKIO_COMPAT_MODE="on" -export LIBCUDF_CUFILE_POLICY="KVIKIO" -export CUDA_MODULE_LOADING="EAGER" -``` - -4. Execute the queries. 
-```bash -./tpch/build/tpch_q[query no] [path to dataset] [memory resource type (cuda/pool/managed/managed_pool)] -``` - -A parquet file named `q[query no].parquet` would be generated containing the results of the query. diff --git a/cpp/examples/tpch/datagen/correct_datatypes.py b/cpp/examples/tpch/datagen/correct_datatypes.py deleted file mode 100644 index 8564774647b..00000000000 --- a/cpp/examples/tpch/datagen/correct_datatypes.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -import os -import sys - -import pyarrow as pa -import pyarrow.parquet as pq -import pandas as pd - -if __name__ == "__main__": - dataset_path = str(sys.argv[1]) - tables = ["lineitem", "part", "partsupp", "orders", "supplier", "customer", "nation", "region"] - for table in tables: - filepath = os.path.join(dataset_path, f"{table}.parquet") - print("Reading file ", filepath) - - if filepath.endswith("lineitem.parquet"): - df = pd.read_parquet(filepath) - df["l_linenumber"] = df["l_linenumber"].astype("int64") - df["l_quantity"] = df["l_quantity"].astype("int64") - df["l_extendedprice"] = df["l_extendedprice"].astype("float64") - df["l_discount"] = df["l_discount"].astype("float64") - df["l_tax"] = df["l_tax"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("part.parquet"): - df = pd.read_parquet(filepath) - df["p_size"] = df["p_size"].astype("int64") - df["p_retailprice"] = df["p_retailprice"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("partsupp.parquet"): - df = pd.read_parquet(filepath) - df["ps_availqty"] = df["ps_availqty"].astype("int64") - df["ps_supplycost"] = df["ps_supplycost"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("orders.parquet"): - df = pd.read_parquet(filepath) - df["o_totalprice"] = df["o_totalprice"].astype("float64") - df["o_shippriority"] = df["o_shippriority"].astype("int64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("supplier.parquet"): - df = pd.read_parquet(filepath) - df["s_acctbal"] = df["s_acctbal"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("customer.parquet"): - df = pd.read_parquet(filepath) - df["c_acctbal"] = df["c_acctbal"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("nation.parquet"): - df = pd.read_parquet(filepath) - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("region.parquet"): - df = pd.read_parquet(filepath) - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") diff --git a/cpp/examples/tpch/datagen/datagen.sh b/cpp/examples/tpch/datagen/datagen.sh deleted file mode 100755 index 0b03753daea..00000000000 --- a/cpp/examples/tpch/datagen/datagen.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -set -e - -scale_factor=$1 -script_dir=$(pwd) - -# Clone the datafusion repository and apply a patch -# for single threaded data generation so that a -# single parquet file is generated for each table -rm -rf datafusion -git clone https://github.com/apache/datafusion.git datafusion -cd datafusion/ -git checkout 679a85f -git apply ${script_dir}/tpch.patch -cd benchmarks/ - -# Generate the data -# Currently, we support only scale factor 1 and 10 -if [ ${scale_factor} -eq 1 ]; then - ./bench.sh data tpch -elif [ ${scale_factor} -eq 10 ]; then - ./bench.sh data tpch10 -else - echo "Unsupported scale factor" - exit 1 -fi - -# Correct the datatypes of the parquet files -python3 ${script_dir}/correct_datatypes.py data/tpch_sf${scale_factor} diff --git a/cpp/examples/tpch/datagen/tpch.patch b/cpp/examples/tpch/datagen/tpch.patch deleted file mode 100644 index 42727aa9904..00000000000 --- a/cpp/examples/tpch/datagen/tpch.patch +++ /dev/null @@ -1,33 +0,0 @@ -diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh -index 3b854f6dc..f000f09c0 100755 ---- a/benchmarks/bench.sh -+++ b/benchmarks/bench.sh -@@ -311,6 +311,15 @@ data_tpch() { - $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet - popd > /dev/null - fi -+ -+ cp ${TPCH_DIR}/lineitem/part-0.parquet ${TPCH_DIR}/lineitem.parquet -+ cp ${TPCH_DIR}/orders/part-0.parquet ${TPCH_DIR}/orders.parquet -+ cp ${TPCH_DIR}/part/part-0.parquet ${TPCH_DIR}/part.parquet -+ cp ${TPCH_DIR}/partsupp/part-0.parquet ${TPCH_DIR}/partsupp.parquet -+ cp ${TPCH_DIR}/customer/part-0.parquet ${TPCH_DIR}/customer.parquet -+ cp ${TPCH_DIR}/supplier/part-0.parquet ${TPCH_DIR}/supplier.parquet -+ cp ${TPCH_DIR}/nation/part-0.parquet ${TPCH_DIR}/nation.parquet -+ cp ${TPCH_DIR}/region/part-0.parquet ${TPCH_DIR}/region.parquet - } - - # Runs the tpch benchmark -diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs -index b5204b343..84fd2e78d 100644 ---- a/datafusion/common/src/config.rs -+++ b/datafusion/common/src/config.rs -@@ -250,7 +250,7 @@ config_namespace! { - /// concurrency. - /// - /// Defaults to the number of CPU cores on the system -- pub target_partitions: usize, default = num_cpus::get() -+ pub target_partitions: usize, default = 1 - - /// The default time zone - /// diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp deleted file mode 100644 index 8102fa8f976..00000000000 --- a/cpp/examples/tpch/utils.hpp +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -// RMM memory resource creation utilities -inline auto make_cuda() { return std::make_shared(); } -inline auto make_pool() -{ - return rmm::mr::make_owning_wrapper( - make_cuda(), rmm::percent_of_free_device_memory(50)); -} -inline auto make_managed() { return std::make_shared(); } -inline auto make_managed_pool() -{ - return rmm::mr::make_owning_wrapper( - make_managed(), rmm::percent_of_free_device_memory(50)); -} -inline std::shared_ptr create_memory_resource( - std::string const& mode) -{ - if (mode == "cuda") return make_cuda(); - if (mode == "pool") return make_pool(); - if (mode == "managed") return make_managed(); - if (mode == "managed_pool") return make_managed_pool(); - CUDF_FAIL("Unknown rmm_mode parameter: " + mode + - "\nExpecting: cuda, pool, managed, or managed_pool"); -} - -/** - * @brief A class to represent a table with column names attached - */ -class table_with_names { - public: - table_with_names(std::unique_ptr tbl, std::vector col_names) - : tbl(std::move(tbl)), col_names(col_names) - { - } - /** - * @brief Return the table view - */ - [[nodiscard]] cudf::table_view table() const { return tbl->view(); } - /** - * @brief Return the column view for a given column name - * - * @param col_name The name of the column - */ - [[nodiscard]] cudf::column_view column(std::string const& col_name) const - { - return tbl->view().column(col_id(col_name)); - } - /** - * @param Return the column names of the table - */ - [[nodiscard]] std::vector column_names() const { return col_names; } - /** - * @brief Translate a column name to a column index - * - * @param col_name The name of the column - */ - [[nodiscard]] cudf::size_type col_id(std::string const& col_name) const - { - CUDF_FUNC_RANGE(); - auto it = std::find(col_names.begin(), col_names.end(), col_name); - if (it == col_names.end()) { throw std::runtime_error("Column not found"); } - return std::distance(col_names.begin(), it); - } - /** - * @brief Append a column to the table - * - * @param col The column to append - * @param col_name The name of the appended column - */ - table_with_names& append(std::unique_ptr& col, std::string const& col_name) - { - CUDF_FUNC_RANGE(); - auto cols = tbl->release(); - cols.push_back(std::move(col)); - tbl = std::make_unique(std::move(cols)); - col_names.push_back(col_name); - return (*this); - } - /** - * @brief Select a subset of columns from the table - * - * @param col_names The names of the columns to select - */ - [[nodiscard]] cudf::table_view select(std::vector const& col_names) const - { - CUDF_FUNC_RANGE(); - std::vector col_indices; - for (auto const& col_name : col_names) { - col_indices.push_back(col_id(col_name)); - } - return tbl->select(col_indices); - } - /** - * @brief Write the table to a parquet file - * - * @param filepath The path to the parquet file - */ - void to_parquet(std::string const& filepath) const - { - CUDF_FUNC_RANGE(); - auto const sink_info = cudf::io::sink_info(filepath); - cudf::io::table_metadata metadata; - metadata.schema_info = - std::vector(col_names.begin(), col_names.end()); - auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; - auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view()); - builder.metadata(table_input_metadata); - auto const options = builder.build(); - 
cudf::io::write_parquet(options); - } - - private: - std::unique_ptr tbl; - std::vector col_names; -}; - -/** - * @brief Concatenate two vectors - * - * @param lhs The left vector - * @param rhs The right vector - */ -template -std::vector concat(std::vector const& lhs, std::vector const& rhs) -{ - std::vector result; - result.reserve(lhs.size() + rhs.size()); - std::copy(lhs.begin(), lhs.end(), std::back_inserter(result)); - std::copy(rhs.begin(), rhs.end(), std::back_inserter(result)); - return result; -} - -/** - * @brief Inner join two tables and gather the result - * - * @param left_input The left input table - * @param right_input The right input table - * @param left_on The columns to join on in the left table - * @param right_on The columns to join on in the right table - * @param compare_nulls The null equality policy - */ -[[nodiscard]] std::unique_ptr join_and_gather( - cudf::table_view const& left_input, - cudf::table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - cudf::null_equality compare_nulls) -{ - CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); - - auto const left_indices_span = cudf::device_span{*left_join_indices}; - auto const right_indices_span = cudf::device_span{*right_join_indices}; - - auto const left_indices_col = cudf::column_view{left_indices_span}; - auto const right_indices_col = cudf::column_view{right_indices_span}; - - auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy); - auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy); - - auto joined_cols = left_result->release(); - auto right_cols = right_result->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(right_cols.begin()), - std::make_move_iterator(right_cols.end())); - return std::make_unique(std::move(joined_cols)); -} - -/** - * @brief Apply an inner join operation to two tables - * - * @param left_input The left input table - * @param right_input The right input table - * @param left_on The columns to join on in the left table - * @param right_on The columns to join on in the right table - * @param compare_nulls The null equality policy - */ -[[nodiscard]] std::unique_ptr apply_inner_join( - std::unique_ptr const& left_input, - std::unique_ptr const& right_input, - std::vector const& left_on, - std::vector const& right_on, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) -{ - CUDF_FUNC_RANGE(); - std::vector left_on_indices; - std::vector right_on_indices; - std::transform( - left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) { - return left_input->col_id(col_name); - }); - std::transform(right_on.begin(), - right_on.end(), - std::back_inserter(right_on_indices), - [&](auto const& col_name) { return right_input->col_id(col_name); }); - auto table = join_and_gather( - left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls); - return std::make_unique( - std::move(table), concat(left_input->column_names(), right_input->column_names())); -} - -/** - * @brief Apply a filter predicated to a table - * - * @param table The input table - * @param predicate The filter 
predicate - */ -[[nodiscard]] std::unique_ptr apply_filter( - std::unique_ptr const& table, cudf::ast::operation const& predicate) -{ - CUDF_FUNC_RANGE(); - auto const boolean_mask = cudf::compute_column(table->table(), predicate); - auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); - return std::make_unique(std::move(result_table), table->column_names()); -} - -/** - * @brief Apply a boolean mask to a table - * - * @param table The input table - * @param mask The boolean mask - */ -[[nodiscard]] std::unique_ptr apply_mask( - std::unique_ptr const& table, std::unique_ptr const& mask) -{ - CUDF_FUNC_RANGE(); - auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); - return std::make_unique(std::move(result_table), table->column_names()); -} - -struct groupby_context_t { - std::vector keys; - std::unordered_map>> - values; -}; - -/** - * @brief Apply a groupby operation to a table - * - * @param table The input table - * @param ctx The groupby context - */ -[[nodiscard]] std::unique_ptr apply_groupby( - std::unique_ptr const& table, groupby_context_t const& ctx) -{ - CUDF_FUNC_RANGE(); - auto const keys = table->select(ctx.keys); - cudf::groupby::groupby groupby_obj(keys); - std::vector result_column_names; - result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end()); - std::vector requests; - for (auto& [value_col, aggregations] : ctx.values) { - requests.emplace_back(cudf::groupby::aggregation_request()); - for (auto& agg : aggregations) { - if (agg.first == cudf::aggregation::Kind::SUM) { - requests.back().aggregations.push_back( - cudf::make_sum_aggregation()); - } else if (agg.first == cudf::aggregation::Kind::MEAN) { - requests.back().aggregations.push_back( - cudf::make_mean_aggregation()); - } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) { - requests.back().aggregations.push_back( - cudf::make_count_aggregation()); - } else { - throw std::runtime_error("Unsupported aggregation"); - } - result_column_names.push_back(agg.second); - } - requests.back().values = table->column(value_col); - } - auto agg_results = groupby_obj.aggregate(requests); - std::vector> result_columns; - for (size_t i = 0; i < agg_results.first->num_columns(); i++) { - auto col = std::make_unique(agg_results.first->get_column(i)); - result_columns.push_back(std::move(col)); - } - for (size_t i = 0; i < agg_results.second.size(); i++) { - for (size_t j = 0; j < agg_results.second[i].results.size(); j++) { - result_columns.push_back(std::move(agg_results.second[i].results[j])); - } - } - auto result_table = std::make_unique(std::move(result_columns)); - return std::make_unique(std::move(result_table), result_column_names); -} - -/** - * @brief Apply an order by operation to a table - * - * @param table The input table - * @param sort_keys The sort keys - * @param sort_key_orders The sort key orders - */ -[[nodiscard]] std::unique_ptr apply_orderby( - std::unique_ptr const& table, - std::vector const& sort_keys, - std::vector const& sort_key_orders) -{ - CUDF_FUNC_RANGE(); - std::vector column_views; - for (auto& key : sort_keys) { - column_views.push_back(table->column(key)); - } - auto result_table = - cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders); - return std::make_unique(std::move(result_table), table->column_names()); -} - -/** - * @brief Apply a reduction operation to a column - * - * @param column The input column - * @param agg_kind The aggregation kind - * @param col_name The 
name of the output column - */ -[[nodiscard]] std::unique_ptr apply_reduction( - cudf::column_view const& column, - cudf::aggregation::Kind const& agg_kind, - std::string const& col_name) -{ - CUDF_FUNC_RANGE(); - auto const agg = cudf::make_sum_aggregation(); - auto const result = cudf::reduce(column, *agg, column.type()); - cudf::size_type const len = 1; - auto col = cudf::make_column_from_scalar(*result, len); - std::vector> columns; - columns.push_back(std::move(col)); - auto result_table = std::make_unique(std::move(columns)); - std::vector col_names = {col_name}; - return std::make_unique(std::move(result_table), col_names); -} - -/** - * @brief Read a parquet file into a table - * - * @param filename The path to the parquet file - * @param columns The columns to read - * @param predicate The filter predicate to pushdown - */ -[[nodiscard]] std::unique_ptr read_parquet( - std::string const& filename, - std::vector const& columns = {}, - std::unique_ptr const& predicate = nullptr) -{ - CUDF_FUNC_RANGE(); - auto const source = cudf::io::source_info(filename); - auto builder = cudf::io::parquet_reader_options_builder(source); - if (!columns.empty()) { builder.columns(columns); } - if (predicate) { builder.filter(*predicate); } - auto const options = builder.build(); - auto table_with_metadata = cudf::io::read_parquet(options); - std::vector column_names; - for (auto const& col_info : table_with_metadata.metadata.schema_info) { - column_names.push_back(col_info.name); - } - return std::make_unique(std::move(table_with_metadata.tbl), column_names); -} - -/** - * @brief Generate the `std::tm` structure from year, month, and day - * - * @param year The year - * @param month The month - * @param day The day - */ -std::tm make_tm(int year, int month, int day) -{ - std::tm tm{}; - tm.tm_year = year - 1900; - tm.tm_mon = month - 1; - tm.tm_mday = day; - return tm; -} - -/** - * @brief Calculate the number of days since the UNIX epoch - * - * @param year The year - * @param month The month - * @param day The day - */ -int32_t days_since_epoch(int year, int month, int day) -{ - std::tm tm = make_tm(year, month, day); - std::tm epoch = make_tm(1970, 1, 1); - std::time_t time = std::mktime(&tm); - std::time_t epoch_time = std::mktime(&epoch); - double diff = std::difftime(time, epoch_time) / (60 * 60 * 24); - return static_cast(diff); -} - -struct tpch_example_args { - std::string dataset_dir; - std::string memory_resource_type; -}; - -/** - * @brief Parse command line arguments into a struct - * - * @param argc The number of command line arguments - * @param argv The command line arguments - */ -tpch_example_args parse_args(int argc, char const** argv) -{ - if (argc < 3) { - std::string usage_message = "Usage: " + std::string(argv[0]) + - " \n The query result will be " - "saved to a parquet file named q{query_no}.parquet in the current " - "working directory "; - throw std::runtime_error(usage_message); - } - tpch_example_args args; - args.dataset_dir = argv[1]; - args.memory_resource_type = argv[2]; - return args; -} diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a3d6533705e..ff25a5bacae 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -128,6 +129,19 @@ class json_reader_options { // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + // Validation checks for spark + // Should the json validation be strict or 
not + // Note: strict validation enforces the JSON specification https://www.json.org/json-en.html + bool _strict_validation = false; + // Allow leading zeros for numeric values. + bool _allow_numeric_leading_zeros = true; + // Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity + bool _allow_nonnumeric_numbers = true; + // Allow unquoted control characters + bool _allow_unquoted_control_chars = true; + // Additional values to recognize as null values + std::vector<std::string> _na_values; + /** * @brief Constructor from source info. * @@ -298,6 +312,55 @@ class json_reader_options { */ [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** + * @brief Whether json validation should be enforced strictly or not. + * + * @return true if strict validation is enabled. + */ + [[nodiscard]] bool is_strict_validation() const { return _strict_validation; } + + /** + * @brief Whether leading zeros are allowed in numeric values. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if leading zeros are allowed in numeric values + */ + [[nodiscard]] bool is_allowed_numeric_leading_zeros() const + { + return _allow_numeric_leading_zeros; + } + + /** + * @brief Whether the unquoted values NaN, +INF, -INF, +Infinity, Infinity, and -Infinity + * should be allowed as numbers. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if nonnumeric numbers are allowed + */ + [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } + + /** + * @brief Whether characters with code points 0 through 31 are allowed unescaped inside a + * quoted string. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if unquoted control chars are allowed. + */ + [[nodiscard]] bool is_allowed_unquoted_control_chars() const + { + return _allow_unquoted_control_chars; + } + + /** + * @brief Returns additional values to recognize as null values. + * + * @return Additional values to recognize as null values + */ + [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; } + /** * @brief Set data types for columns to be read. * @@ -427,6 +490,63 @@ class json_reader_options { * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. */ void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } + + /** + * @brief Set whether strict validation is enabled or not. + * + * @param val Boolean value to indicate whether strict validation is enabled. + */ + void set_strict_validation(bool val) { _strict_validation = val; } + + /** + * @brief Set whether leading zeros are allowed in numeric values. Strict validation + * must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + */ + void allow_numeric_leading_zeros(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_numeric_leading_zeros = val; + } + + /** + * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, + * Infinity, and -Infinity. Strict validation must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. 
+ * + * @param val Boolean value to indicate whether nonnumeric numbers are allowed + */ + void allow_nonnumeric_numbers(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_nonnumeric_numbers = val; + } + + /** + * @brief Set whether characters with code points 0 through 31 are allowed unescaped inside + * a quoted string. Strict validation must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val true to indicate whether unquoted control chars are allowed. + */ + void allow_unquoted_control_chars(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_unquoted_control_chars = val; + } + + /** + * @brief Sets additional values to recognize as null values. + * + * @param vals Vector of values to be considered to be null + */ + void set_na_values(std::vector<std::string> vals) { _na_values = std::move(vals); } +}; /** @@ -638,6 +758,76 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether json validation should be strict or not. + * + * @param val Boolean value to indicate whether json validation should be strict or not. + * @return this for chaining + */ + json_reader_options_builder& strict_validation(bool val) + { + options.set_strict_validation(val); + return *this; + } + + /** + * @brief Set whether leading zeros are allowed in numeric values. Strict validation must + * be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + * @return this for chaining + */ + json_reader_options_builder& numeric_leading_zeros(bool val) + { + options.allow_numeric_leading_zeros(val); + return *this; + } + + /** + * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, + * +INF, -INF, +Infinity, Infinity, and -Infinity. + * Strict validation must be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate if unquoted nonnumeric values are valid JSON or not. + * @return this for chaining + */ + json_reader_options_builder& nonnumeric_numbers(bool val) + { + options.allow_nonnumeric_numbers(val); + return *this; + } + + /** + * @brief Set whether characters with code points 0 through 31 are allowed unescaped in a + * quoted string. Strict validation must be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate if unquoted control chars are allowed or not. + * @return this for chaining + */ + json_reader_options_builder& unquoted_control_chars(bool val) + { + options.allow_unquoted_control_chars(val); + return *this; + } + + /** + * @brief Sets additional values to recognize as null values. + * + * @param vals Vector of values to be considered to be null + * @return this for chaining + */ + json_reader_options_builder& na_values(std::vector<std::string> vals) + { + options.set_na_values(std::move(vals)); + return *this; + } + 
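A minimal sketch of these options working together, assuming a newline-delimited JSON source file named `data.jsonl`; invalid rows are nulled rather than failing the whole read:

```cpp
auto opts =
  cudf::io::json_reader_options::builder(cudf::io::source_info{"data.jsonl"})
    .lines(true)
    .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
    .strict_validation(true)        // must precede the options below
    .numeric_leading_zeros(false)   // reject values like 012
    .nonnumeric_numbers(true)       // accept NaN, Infinity, -Infinity, ...
    .unquoted_control_chars(false)  // reject raw control chars in strings
    .na_values({"N/A"})
    .build();
auto result = cudf::io::read_json(opts);
```

 /** * @brief move json_reader_options member once it's built. 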
*/ diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp index e7fe3cc7214..0d74a4158ad 100644 --- a/cpp/include/cudf/io/nvcomp_adapter.hpp +++ b/cpp/include/cudf/io/nvcomp_adapter.hpp @@ -24,7 +24,7 @@ namespace CUDF_EXPORT cudf { namespace io::nvcomp { -enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; +enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4, GZIP }; /** * @brief Set of parameters that impact whether nvCOMP features are enabled. diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index c83a4260c19..7c909f1a948 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ std::unique_ptr minhash( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -133,7 +134,7 @@ std::unique_ptr minhash64( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column 
size limit + * + * @param input Strings column to compute minhash + * @param seeds Seed values used for the hash algorithm @@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr<cudf::column> word_minhash( + cudf::lists_column_view const& input, + cudf::device_span<uint32_t const> seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm though + * only the first 64 bits of the hash are used in computing the output. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr<cudf::column> word_minhash64( + cudf::lists_column_view const& input, + cudf::device_span<uint64_t const> seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 261a8eb401d..c3187f73a95 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -23,6 +23,7 @@ #include #include +#include <nvcomp/gzip.h> #include #include #include @@ -44,6 +45,8 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
return nvcompBatchedLZ4DecompressGetTempSizeEx(std::forward<Args>(args)...); case compression_type::DEFLATE: return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...); + case compression_type::GZIP: + return nvcompBatchedGzipDecompressGetTempSizeEx(std::forward<Args>(args)...); default: CUDF_FAIL("Unsupported compression type"); } } @@ -73,6 +76,8 @@ auto batched_decompress_async(compression_type compression, Args&&... args) case compression_type::DEFLATE: return nvcompBatchedDeflateDecompressAsync(std::forward<Args>(args)...); case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...); + case compression_type::GZIP: + return nvcompBatchedGzipDecompressAsync(std::forward<Args>(args)...); default: CUDF_FAIL("Unsupported compression type"); } } @@ -84,6 +89,7 @@ std::string compression_type_name(compression_type compression) { case compression_type::ZSTD: return "Zstandard"; case compression_type::DEFLATE: return "Deflate"; case compression_type::LZ4: return "LZ4"; + case compression_type::GZIP: return "GZIP"; } return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")"; } @@ -359,8 +365,8 @@ std::optional<std::string> is_compression_disabled_impl(compression_type compres return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; } return std::nullopt; + default: return "Unsupported compression type"; } - return "Unsupported compression type"; } std::optional<std::string> is_compression_disabled(compression_type compression, @@ -396,7 +402,8 @@ std::optional<std::string> is_decompression_disabled_impl(compression_type compr feature_status_parameters params) { switch (compression) { - case compression_type::DEFLATE: { + case compression_type::DEFLATE: + case compression_type::GZIP: { if (not params.are_all_integrations_enabled) { return "DEFLATE decompression is experimental, you can enable it through " "`LIBCUDF_NVCOMP_POLICY` environment variable."; @@ -447,6 +454,7 @@ std::optional<std::string> is_decompression_disabled(compression_type compressio size_t required_alignment(compression_type compression) { switch (compression) { + case compression_type::GZIP: case compression_type::DEFLATE: return nvcompDeflateRequiredAlignment; case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment; case compression_type::ZSTD: return nvcompZstdRequiredAlignment; @@ -462,7 +470,7 @@ std::optional<size_t> compress_max_allowed_chunk_size(compression_type compressi case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize; case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize; case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize; - default: return std::nullopt; + default: CUDF_FAIL("Unsupported compression type"); } } diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 7899ea7bac4..97d5884fef1 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include <cudf/detail/nvtx/ranges.hpp> #include #include #include @@ -302,6 +303,7 @@ void normalize_single_quotes(datasource::owning_buffer& inda rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; auto parser = @@ -330,6 +332,7 @@ void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t 
max_out = 2; auto parser = diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index d153cf5b909..49ad0cb1d34 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -226,6 +226,21 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr device_span<SymbolOffsetT> token_indices, rmm::cuda_stream_view stream); +/** + * @brief Validate the token stream according to the behavior specified in options. + * + * @param d_input The string of input characters + * @param tokens The tokens to be post-processed + * @param token_indices The tokens' corresponding indices that are post-processed + * @param options Parsing options specifying the parsing behaviour + * @param stream The cuda stream to dispatch GPU kernels to + */ +void validate_token_stream(device_span<char const> d_input, + device_span<PdaTokenT> tokens, + device_span<SymbolOffsetT> token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream); + /** + * @brief Parses the given JSON string and generates a tree representation of the given input. + * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index d76e5447c30..4e513d3495c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1660,6 +1660,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); + validate_token_stream(json_in, tokens, tokens_indices, options, stream); auto [filtered_tokens, filtered_tokens_indices] = process_token_stream(tokens, tokens_indices, stream); tokens = std::move(filtered_tokens); @@ -2082,7 +2083,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + std::vector<std::string> na_values{"", "null"}; + na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); + parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); return parse_opts; } diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu new file mode 100644 index 00000000000..83c7b663980 --- /dev/null +++ b/cpp/src/io/json/process_tokens.cu @@ -0,0 +1,310 @@ + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "io/utilities/trie.cuh" +#include "nested_json.hpp" +#include "tabulate_output_iterator.cuh" + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::io::json { +namespace detail { + +struct write_if { + using token_t = cudf::io::json::token_t; + using scan_type = thrust::pair; + PdaTokenT* tokens; + size_t n; + // Index, value + __device__ void operator()(size_type i, scan_type x) + { + if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) { + if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) { + tokens[i] = token_t::ErrorBegin; + } + } + } +}; + +enum class number_state { + START = 0, + SAW_NEG, // not a complete state + LEADING_ZERO, + WHOLE, + SAW_RADIX, // not a complete state + FRACTION, + START_EXPONENT, // not a complete state + AFTER_SIGN_EXPONENT, // not a complete state + EXPONENT +}; + +enum class string_state { + NORMAL = 0, + ESCAPED, // not a complete state + ESCAPED_U // not a complete state +}; + +__device__ inline bool substr_eq(const char* data, + SymbolOffsetT const start, + SymbolOffsetT const end, + SymbolOffsetT const expected_len, + const char* expected) +{ + if (end - start != expected_len) { return false; } + for (auto idx = 0; idx < expected_len; idx++) { + if (data[start + idx] != expected[idx]) { return false; } + } + return true; +} + +void validate_token_stream(device_span d_input, + device_span tokens, + device_span token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + if (!options.is_strict_validation()) { return; } + using token_t = cudf::io::json::token_t; + cudf::detail::optional_trie trie_na = + cudf::detail::create_serialized_trie(options.get_na_values(), stream); + auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto validate_values = cuda::proclaim_return_type( + [data = d_input.data(), + trie_na = trie_na_view, + allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates an unquoted value. 
A value must match https://www.json.org/json-en.html + // but the leading and trailing whitespace should already have been removed, and the + // value is not a quoted string + auto c = data[start]; + auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); + if (is_null_literal) { + return true; + } else if ('n' == c) { + return substr_eq(data, start, end, 4, "null"); + } else if ('t' == c) { + return substr_eq(data, start, end, 4, "true"); + } else if ('f' == c) { + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 4, "+INF") || + substr_eq(data, start, end, 9, "+Infinity"); + } else if ('-' == c || ('0' <= c && c <= '9')) { + // number + auto num_state = number_state::START; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::START: + if ('-' == c) { + num_state = number_state::SAW_NEG; + } else if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else { + return false; + } + break; + case number_state::SAW_NEG: + if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || + substr_eq(data, start, end, 9, "-Infinity"); + } else { + return false; + } + break; + case number_state::LEADING_ZERO: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::WHOLE: + if (c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.'
== c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::SAW_RADIX: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::FRACTION: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::START_EXPONENT: + if ('+' == c || '-' == c) { + num_state = number_state::AFTER_SIGN_EXPONENT; + } else if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::AFTER_SIGN_EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + } + } + return num_state != number_state::AFTER_SIGN_EXPONENT && + num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG && + num_state != number_state::SAW_RADIX; + } else { + return false; + } + }); + + auto validate_strings = cuda::proclaim_return_type( + [data = d_input.data(), + allow_unquoted_control_chars = + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars + + auto state = string_state::NORMAL; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { + return false; + } + + switch (state) { + case string_state::NORMAL: + if (c == '\\') { state = string_state::ESCAPED; } + break; + case string_state::ESCAPED: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. 
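+ // The only single-character escapes accepted here are " \ / b f n r t; a 'u' escape + // must be followed by exactly four hex digits, which the ESCAPED_U state below enforces.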
+ if (c == 'u') { + state = string_state::ESCAPED_U; + u_count = 0; + } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || + c == 'r' || c == 't') { + state = string_state::NORMAL; + } else { + return false; + } + break; + case string_state::ESCAPED_U: + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::NORMAL; + u_count = 0; + } + } else { + return false; + } + break; + } + } + return string_state::NORMAL == state; + }); + + auto num_tokens = tokens.size(); + auto count_it = thrust::make_counting_iterator(0); + auto predicate = [tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { + if (tokens[i] == token_t::ValueEnd) { + return !validate_values(token_indices[i - 1], token_indices[i]); + } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { + return !validate_strings(token_indices[i - 1], token_indices[i]); + } + return false; + }; + + using scan_type = write_if::scan_type; + auto conditional_write = write_if{tokens.begin(), num_tokens}; + auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); + auto transform_op = cuda::proclaim_return_type( + [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; + }); + auto binary_op = cuda::proclaim_return_type( + [] __device__(scan_type prev, scan_type curr) -> scan_type { + auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); + return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + }); + + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + count_it, + count_it + num_tokens, + conditional_output_it, + transform_op, + binary_op); // in-place scan +} +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh new file mode 100644 index 00000000000..7cf3655e259 --- /dev/null +++ b/cpp/src/io/json/tabulate_output_iterator.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +// Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator +template +class tabulate_output_iterator_proxy { + public: + __host__ __device__ tabulate_output_iterator_proxy(const IndexT index, BinaryFunction fun) + : index(index), fun(fun) + { + } + template + __host__ __device__ tabulate_output_iterator_proxy operator=(const T& rhs_value) + { + fun(index, rhs_value); + return *this; + } + + private: + IndexT index; + BinaryFunction fun; +}; + +/** + * @brief Tabulate output iterator with custom binary function which takes index and value. + * + * @code {.cpp} + * #include "tabulate_output_iterator.cuh" + * #include + * #include + * #include + * + * struct set_bits_field { + * int* bitfield; + * __device__ inline void set_bit(size_t bit_index) + * { + * atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32))); + * } + * __device__ inline void clear_bit(size_t bit_index) + * { + * atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32))); + * } + * // Index, value + * __device__ void operator()(size_t i, bool x) + * { + * if (x) + * set_bit(i); + * else + * clear_bit(i); + * } + * }; + * + * thrust::device_vector v(1, 0x00000000); + * auto result_begin = cudf::detail::make_tabulate_output_iterator(set_bits_field{v.data().get()}); + * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + * [] __device__ (int x) { return x%2; }); + * thrust::copy(thrust::device, value, value+32, result_begin); + * assert(v[0] == 0xaaaaaaaa); + * @endcode + * + * @tparam BinaryFunction Binary function to be called with the index value and the rhs of the + * assignment operator. + * @tparam Iterator iterator type that acts as index of the output. + */ +template +class tabulate_output_iterator + : public thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy> { + public: + // parent class. + using super_t = thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy>; + // friend thrust::iterator_core_access to allow it access to the private interface dereference() + friend class thrust::iterator_core_access; + __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {} + + private: + BinaryFunction fun; + + // thrust::iterator_core_access accesses this function + __host__ __device__ typename super_t::reference dereference() const + { + return tabulate_output_iterator_proxy(*this->base(), fun); + } +}; + +template +tabulate_output_iterator __host__ __device__ +make_tabulate_output_iterator(BinaryFunction fun) +{ + return tabulate_output_iterator(fun); +} // end make_tabulate_output_iterator + +} // namespace detail +} // namespace cudf + +// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from +// type_traits to enable its use with algorithms.
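+// Without this trait specialization, generic algorithms would treat the proxy as a plain +// value type and assignments made through the iterator would not reach the supplied functor.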
+template +struct thrust::detail::is_proxy_reference< + cudf::detail::tabulate_output_iterator_proxy> + : public thrust::detail::true_type {}; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a3f91f6859b..9ed2929a70e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - constexpr int batch_size = 32; + constexpr int batch_size = cudf::detail::warp_size; int cur_leaf_count = target_leaf_count; while (s->error == 0 && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 5d10472b0ae..7c985643887 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -203,10 +203,9 @@ struct SchemaElement { bool operator==(SchemaElement const& other) const { return type == other.type && converted_type == other.converted_type && - type_length == other.type_length && repetition_type == other.repetition_type && - name == other.name && num_children == other.num_children && - decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision && - field_id == other.field_id; + type_length == other.type_length && name == other.name && + num_children == other.num_children && decimal_scale == other.decimal_scale && + decimal_precision == other.decimal_precision && field_id == other.field_id; } // the parquet format is a little squishy when it comes to interpreting diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 125d35f6499..1390339c1ae 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -400,7 +400,8 @@ struct ColumnChunkDesc { int32_t src_col_schema_, column_chunk_info const* chunk_info_, float list_bytes_per_row_est_, - bool strings_to_categorical_) + bool strings_to_categorical_, + int32_t src_file_idx_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -419,7 +420,8 @@ struct ColumnChunkDesc { src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), list_bytes_per_row_est(list_bytes_per_row_est_), - is_strings_to_cat(strings_to_categorical_) + is_strings_to_cat(strings_to_categorical_), + src_file_idx(src_file_idx_) { } @@ -456,6 +458,7 @@ struct ColumnChunkDesc { bool is_strings_to_cat{}; // convert strings to hashes bool is_large_string_col{}; // `true` if string data uses 64-bit offsets + int32_t src_file_idx{}; // source file index }; /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 84f0dab0d8b..c588fedb85c 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -865,8 +865,18 @@ std::vector compute_page_splits_by_row(device_spanmap_schema_index(col.schema_idx, + rg.source_index); + }); it != columns.end()) { return std::distance(columns.begin(), it); } @@ -1525,7 +1538,8 @@ void reader::impl::create_global_chunk_info() auto col = _input_columns[i]; // look up metadata auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); + auto& schema = _metadata->get_schema( + _metadata->map_schema_index(col.schema_idx, rg.source_index), rg.source_index); auto [clock_rate, logical_type] = 
conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()), @@ -1564,9 +1578,9 @@ void reader::impl::create_global_chunk_info() col.schema_idx, chunk_info, list_bytes_per_row_est, - schema.type == BYTE_ARRAY and _strings_to_categorical)); + schema.type == BYTE_ARRAY and _strings_to_categorical, + rg.source_index)); } - // Adjust for skip_rows when updating the remaining rows after the first group remaining_rows -= (skip_rows) ? std::min(rg.start_row + row_group.num_rows - skip_rows, remaining_rows) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 8b5678f202b..6d566b5815e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -423,8 +423,13 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf std::vector chunks(rg.columns.size()); for (size_t col_idx = 0; col_idx < rg.columns.size(); col_idx++) { - auto const& col_chunk = rg.columns[col_idx]; - auto& schema = get_schema(col_chunk.schema_idx); + auto const& col_chunk = rg.columns[col_idx]; + auto const is_schema_idx_mapped = + is_schema_index_mapped(col_chunk.schema_idx, rg_info.source_index); + auto const mapped_schema_idx = is_schema_idx_mapped + ? map_schema_index(col_chunk.schema_idx, rg_info.source_index) + : col_chunk.schema_idx; + auto& schema = get_schema(mapped_schema_idx, is_schema_idx_mapped ? rg_info.source_index : 0); auto const max_def_level = schema.max_definition_level; auto const max_rep_level = schema.max_repetition_level; @@ -559,22 +564,40 @@ aggregate_reader_metadata::aggregate_reader_metadata( num_rows(calc_num_rows()), num_row_groups(calc_num_row_groups()) { - // Validate that all sources have the same schema unless we are reading select columns - // from mismatched sources, in which case, we will only check the projected columns later. - if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) { - auto const& first_meta = per_file_metadata.front(); + if (per_file_metadata.size() > 1) { + auto& first_meta = per_file_metadata.front(); auto const num_cols = first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; - auto const& schema = first_meta.schema; - - // Verify that the input files have matching numbers of columns and schema. - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() > 0) { - CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), - "All sources must have the same number of columns"); + auto& schema = first_meta.schema; + + // Validate that all sources have the same schema unless we are reading select columns + // from mismatched sources, in which case, we will only check the projected columns later. + if (not has_cols_from_mismatched_srcs) { + // Verify that the input files have matching numbers of columns and schema. + for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } - CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } + + // Mark the column schema in the first (default) source as nullable if it is nullable in any of + // the input sources. This avoids recomputing this within build_column() and + // populate_metadata(). 
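+ // A column is nullable if its repetition_type is not REQUIRED in at least one source, + // so flip REQUIRED to OPTIONAL in the zeroth schema whenever a later source disagrees.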
+ std::for_each( + thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(schema.size()), + [&](auto const schema_idx) { + if (schema[schema_idx].repetition_type == REQUIRED and + std::any_of( + per_file_metadata.begin() + 1, per_file_metadata.end(), [&](auto const& pfm) { + return pfm.schema[schema_idx].repetition_type != REQUIRED; + })) { + schema[schema_idx].repetition_type = OPTIONAL; + } + }); } // Collect and apply arrow:schema from Parquet's key value metadata section @@ -884,15 +907,8 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t size_type src_idx, int schema_idx) const { - // schema_idx_maps will only have > 0 size when we are reading matching column projection from - // mismatched Parquet sources. - if (src_idx and not schema_idx_maps.empty()) { - auto const& schema_idx_map = schema_idx_maps[src_idx - 1]; - CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), - "Unmapped schema index encountered in the specified source tree", - std::range_error); - schema_idx = schema_idx_map.at(schema_idx); - } + // Map schema index to the provided source file index + schema_idx = map_schema_index(schema_idx, src_idx); auto col = std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), @@ -924,6 +940,46 @@ aggregate_reader_metadata::get_rowgroup_metadata() const return rg_metadata; } +bool aggregate_reader_metadata::is_schema_index_mapped(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // True if root index requested or zeroth file index or schema_idx maps doesn't exist. (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return true; } + + // Check if mapped + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + return schema_idx_map.find(schema_idx) != schema_idx_map.end(); +} + +int aggregate_reader_metadata::map_schema_index(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // Check if pfm_idx is zero or root index requested or schema_idx_maps doesn't exist (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return schema_idx; } + + // schema_idx_maps will only have > 0 size when we are reading matching column projection from + // mismatched Parquet sources. + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), + "Unmapped schema index encountered in the specified source tree", + std::out_of_range); + + // Return the mapped schema idx. 
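+ // (.at() cannot throw here; the CUDF_EXPECTS above has verified that the mapping exists.)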
+ return schema_idx_map.at(schema_idx); +} + std::string aggregate_reader_metadata::get_pandas_index() const { // Assumes that all input files have the same metadata @@ -1185,8 +1241,8 @@ aggregate_reader_metadata::select_columns( // Compares two schema elements to be equal except their number of children auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) { return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and - lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and - lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and + lhs.type_length == rhs.type_length and lhs.name == rhs.name and + lhs.decimal_scale == rhs.decimal_scale and lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id; }; @@ -1209,6 +1265,11 @@ aggregate_reader_metadata::select_columns( "the selected path", std::invalid_argument); + // Get the schema_idx_map for this data source (pfm) + auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + // Map the schema index from 0th tree (src) to the one in the current (dst) tree. + schema_idx_map[src_schema_idx] = dst_schema_idx; + // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer // hierarchy. So continue on with mapping. if (src_schema_elem.is_stub()) { @@ -1262,15 +1323,6 @@ aggregate_reader_metadata::select_columns( pfm_idx); }); } - - // We're at a leaf and this is an input column (one with actual data stored) so map it. - if (src_schema_elem.num_children == 0) { - // Get the schema_idx_map for this data source (pfm) - auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; - - // Map the schema index from 0th tree (src) to the one in the current (dst) tree. - schema_idx_map[src_schema_idx] = dst_schema_idx; - } }; std::vector output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6f2863136b2..6487c92f48f 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -234,6 +234,26 @@ class aggregate_reader_metadata { [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } + /** + * @brief Checks if a schema index from 0th source is mapped to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to check mappings for. + * + * @return True if schema index is mapped + */ + [[nodiscard]] bool is_schema_index_mapped(int schema_idx, int pfm_idx) const; + + /** + * @brief Maps schema index from 0th source file to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to map the schema_idx to. 
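+ * + * @throws std::out_of_range if the indices are invalid or if the schema index is not + * mapped in the pfm_idx'th source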
+ * + * @return Mapped schema index + */ + [[nodiscard]] int map_schema_index(int schema_idx, int pfm_idx) const; + /** * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file * @@ -248,7 +268,7 @@ class aggregate_reader_metadata { CUDF_EXPECTS( schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), "Parquet reader encountered an invalid schema_idx or pfm_idx", - std::invalid_argument); + std::out_of_range); return per_file_metadata[pfm_idx].schema[schema_idx]; } @@ -256,7 +276,10 @@ class aggregate_reader_metadata { [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); } /** - * @brief Gets the concrete nesting depth of output cudf columns + * @brief Gets the concrete nesting depth of output cudf columns. + * + * Gets the nesting depth of the output cudf column for the given schema. + * The nesting depth must be equal for the given schema_index across all sources. * * @param schema_index Schema index of the input column * diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 52918f5bc80..8e67f233213 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -79,23 +79,30 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str * is indicated when adding new values. This function generates the mappings of * the R/D levels to those start/end bounds * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for + * @param remap Maps column schema index to the R/D remapping vectors for that column for a + * particular input source file + * @param src_col_schema The source column schema to generate the new mapping for + * @param mapped_src_col_schema Mapped column schema for src_file_idx'th file + * @param src_file_idx The input source file index for the column schema * @param md File metadata information */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) +void generate_depth_remappings( + std::map, std::pair, std::vector>>& remap, + int const src_col_schema, + int const mapped_src_col_schema, + int const src_file_idx, + aggregate_reader_metadata const& md) { // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); + if (remap.find({src_col_schema, src_file_idx}) != remap.end()) { return; } + auto const& schema = md.get_schema(mapped_src_col_schema, src_file_idx); + auto const max_depth = md.get_output_nesting_depth(src_col_schema); - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + CUDF_EXPECTS(remap.find({src_col_schema, src_file_idx}) == remap.end(), "Attempting to remap a schema more than once"); auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + remap.insert(std::pair, std::pair, std::vector>>{ + {src_col_schema, src_file_idx}, {}}); auto& depth_remap = inserted.first->second; std::vector& rep_depth_remap = (depth_remap.first); @@ -136,15 +143,15 @@ void generate_depth_remappings(std::map, std::ve auto find_shallowest = [&](int r) { int shallowest = -1; int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; while (schema_idx > 0) { - auto cur_schema = 
md.get_schema(schema_idx); + auto& cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r) { // if this is a repeated field, map it one level deeper shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; } // if it's one-level encoding list - else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) { + else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx, src_file_idx))) { shallowest = cur_depth - 1; } if (!cur_schema.is_stub()) { cur_depth--; } @@ -159,10 +166,10 @@ void generate_depth_remappings(std::map, std::ve for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { auto find_deepest = [&](int d) { SchemaElement prev_schema; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; int r1 = 0; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_definition_level == d) { // if this is a repeated field, map it one level deeper r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level @@ -175,10 +182,10 @@ void generate_depth_remappings(std::map, std::ve // we now know R1 from above. return the deepest nesting level that has the // same repetition level - schema_idx = src_col_schema; + schema_idx = mapped_src_col_schema; int depth = max_depth - 1; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r1) { // if this is a repeated field, map it one level deeper depth = cur_schema.is_stub() ? depth + 1 : depth; @@ -783,9 +790,20 @@ void reader::impl::allocate_nesting_info() std::vector per_page_nesting_info_size(num_columns); auto iter = thrust::make_counting_iterator(size_type{0}); std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) { + // Schema index of the current input column auto const schema_idx = _input_columns[i].schema_idx; - auto const& schema = _metadata->get_schema(schema_idx); - return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + // Get the max_definition_level of this column across all sources. + auto max_definition_level = _metadata->get_schema(schema_idx).max_definition_level + 1; + std::for_each(thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const& schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + max_definition_level = + std::max(max_definition_level, schema.max_definition_level + 1); + }); + + return std::max(max_definition_level, _metadata->get_output_nesting_depth(schema_idx)); }); // compute total # of page_nesting infos needed and allocate space. doing this in one @@ -813,6 +831,8 @@ void reader::impl::allocate_nesting_info() page_nesting_decode_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx]; + // Set the number of output nesting levels from the zeroth source as nesting must be + // identical across sources. 
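+ // (Per-source schemas may differ in max_definition_level but never in output nesting depth.)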
pages[target_page_index + p_idx].num_output_nesting_levels = _metadata->get_output_nesting_depth(src_col_schema); @@ -821,25 +841,36 @@ void reader::impl::allocate_nesting_info() target_page_index += subpass.column_page_count[idx]; } + // Reset the target_page_index + target_page_index = 0; + // fill in int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const src_col_schema = _input_columns[idx].schema_idx; - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) + // nesting depth must be same across sources so getting it from the zeroth source is ok int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema); + // Map to store depths if this column has lists + std::map, std::pair, std::vector>> depth_remapping; // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } + std::for_each( + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const mapped_schema_idx = _metadata->map_schema_index(src_col_schema, src_file_idx); + if (_metadata->get_schema(mapped_schema_idx, src_file_idx).max_repetition_level > 0) { + generate_depth_remappings( + depth_remapping, src_col_schema, mapped_schema_idx, src_file_idx, *_metadata); + } + }); // fill in host-side nesting info - int schema_idx = src_col_schema; + int schema_idx = src_col_schema; + // This is okay as we only use this to check stubness of cur_schema and + // to get its parent's indices, both of which are one to one mapped. auto cur_schema = _metadata->get_schema(schema_idx); int cur_depth = max_output_depth - 1; while (schema_idx > 0) { @@ -848,6 +879,9 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { + // Source file index for the current page. + auto const src_file_idx = + pass.chunks[pages[target_page_index + p_idx].chunk_idx].src_file_idx; PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; @@ -855,9 +889,11 @@ void reader::impl::allocate_nesting_info() &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; + auto const mapped_src_col_schema = + _metadata->map_schema_index(src_col_schema, src_file_idx); // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); + if (_metadata->get_schema(mapped_src_col_schema, src_file_idx).max_repetition_level > 0) { + auto remap = depth_remapping.find({src_col_schema, src_file_idx}); CUDF_EXPECTS(remap != depth_remapping.end(), "Could not find depth remapping for schema"); std::vector const& rep_depth_remap = (remap->second.first); @@ -871,11 +907,15 @@ void reader::impl::allocate_nesting_info() } } + // Get the schema from the current input source. 
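+ // Its max_definition_level and physical type may differ from the zeroth source, so read + // them from the schema of the file that produced this page.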
+ auto& actual_cur_schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + // values indexed by output column index - nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level; + nesting_info[cur_depth].max_def_level = actual_cur_schema.max_definition_level; pni[cur_depth].size = 0; pni[cur_depth].type = - to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id()); + to_type_id(actual_cur_schema, _strings_to_categorical, _options.timestamp_type.id()); pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL; } @@ -888,6 +928,8 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } + // Offset the page and nesting info indices + target_page_index += subpass.column_page_count[idx]; nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. 
* @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 978a844c476..4c39fc96397 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -122,26 +122,28 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings, break; } size_type const cc = (itr < end) && is_begin_utf8_char(*itr); - size_type const bc = (itr < end); + size_type const bc = (itr < end) ? 
bytes_in_utf8_byte(*itr) : 0; char_count += cg::reduce(warp, cc, cg::plus()); byte_count += cg::reduce(warp, bc, cg::plus()); itr += cudf::detail::warp_size; } + __syncwarp(); + if (warp.thread_rank() == 0) { if (start >= char_count) { d_output[str_idx] = string_index_pair{"", 0}; return; } - // we are just below start/stop and must now increment up to it from here + // we are just below start/stop and must now increment up to them from here auto first_byte = start_counts.second; if (start_counts.first < start) { auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte); first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first)); } - stop = max(stop, char_count); + stop = min(stop, char_count); auto last_byte = stop_counts.second; if (stop_counts.first < stop) { auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 605582f28a6..a03a34f5fa7 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -151,15 +153,111 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); return hashes; } -std::unique_ptr build_list_result(cudf::strings_column_view const& input, +/** + * @brief Compute the minhash of each list row of strings for each seed + * + * This is a warp-per-row algorithm where parallel threads within a warp + * work on strings in a single list row. + * + * @tparam HashFunction hash function to use on each string + * + * @param d_input List of strings to process + * @param seeds Seeds for hashing each string + * @param d_hashes Minhash output values (one per row) + */ +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, + cudf::device_span seeds, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = idx / cudf::detail::warp_size; + + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const d_output = d_hashes + (row_idx * seeds.size()); + + // initialize hashes output for this row + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (lane_idx == 0) { + auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + // each lane hashes a string from the input row + for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { + auto const hash_str = + d_row.is_null(str_idx) ? 
cudf::string_view{} : d_row.element(str_idx); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash string and store the min value + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. + hv = thrust::get<0>(hasher(hash_str)); + } + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hv, cuda::std::memory_order_relaxed); + } + } +} + +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + auto lcdv = cudf::detail::lists_column_device_view(*d_input); + + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + minhash_word_kernel + <<>>(lcdv, seeds, d_hashes); + + return hashes; +} + +std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, rmm::cuda_stream_view stream, @@ -176,7 +274,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& std::move(offsets), std::move(hashes), input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input, stream, mr), stream, mr); // expect this condition to be very rare @@ -208,7 +306,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -232,7 +330,27 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return 
build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } } // namespace detail @@ -276,4 +394,21 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash(input, seeds, stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash64(input, seeds, stream, mr); +} } // namespace nvtext diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 5f890722345..e512d28b625 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2180,6 +2180,86 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) cudf::set_pinned_memory_resource(last_mr); } +// Validation +TEST_F(JsonReaderTest, ValueValidation) +{ + // parsing error as null rows + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2 }{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid/null based on option) + R"({"a": 1, "c":nan, "d": "null" } )" + "\n" + "\n" + // 4 -> (valid/null based on option) + R"({"a":04, "c": 1.23, "d": "abc"} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> ((valid/null based on option) + R"({"a":06} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + // leadingZeros allowed + // na_values, + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .strict_validation(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, + {true, false, false, false, true, true, true, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0}, + {false, false, false, false, true, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } + // leadingZeros not allowed, NaN allowed + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + 
.strict_validation(true) + .numeric_leading_zeros(false) + .na_values({"nan"}); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, + {true, false, false, true, false, true, false, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0}, + {false, false, false, false, false, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } +} + TEST_F(JsonReaderTest, MixedTypes) { using LCWS = cudf::test::lists_column_wrapper; diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and 
so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp index 52e439bd93f..7f7fd9d521b 100644 --- a/cpp/tests/strings/slice_tests.cpp +++ b/cpp/tests/strings/slice_tests.cpp @@ -268,6 +268,25 @@ TEST_F(StringsSliceTest, MaxPositions) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsSliceTest, MultiByteChars) +{ + auto input = cudf::test::strings_column_wrapper({ + // clang-format off + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "the following code snippet demonstrates how to use search for values in an ordered range " + // this placement tests proper multi-byte chars handling ------vvvvv + "it returns the last position where value could be inserted without the ééééé ordering ", + "algorithms execution is parallelized as determined by an execution policy; this is a 12345" + "continuation of previous row to make sure string boundaries are honored 012345678901234567" + // v--- this one also + "01234567890é34567890012345678901234567890" + // clang-format on + }); + + auto results = cudf::strings::slice_strings(cudf::strings_column_view(input), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); +} + TEST_F(StringsSliceTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..e23f3f6e7d8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, WordsMinHash) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto validity = cudf::test::iterators::null_at(1); + + LCWS input( + {LCWS({"hello", "abcdéfgh"}), + LCWS{}, + LCWS({"rapids", "moré", "test", "text"}), + LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, + validity); + + auto view = cudf::lists_column_view(input); + + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); + using LCW32 = cudf::test::lists_column_wrapper; + LCW32 expected({LCW32{2069617641u, 1975382903u}, + LCW32{}, + LCW32{657297235u, 1010955999u}, + LCW32{644643885u, 310002789u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, + LCW64{}, + LCW64{5331949571924938590ul, 2088583894581919741ul}, + LCW64{3400468157617183341ul, 2398577492366130055ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/dependencies.yaml b/dependencies.yaml index 483335c02ff..7a13043cc5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -710,7 +710,16 @@ dependencies: - numpy==1.23.* - pandas==2.0.* - pyarrow==14.0.0 - - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: conda + matrices: + - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"} + packages: + - cupy==12.2.0 # cupy 12.2.0 is the earliest with CUDA 12 ARM packages. + - matrix: {dependencies: "oldest"} + packages: + - cupy==12.0.0 - matrix: packages: - output_types: requirements diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 0398a8d7086..41838e01dd9 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the ```bash python -m cudf.pandas --profile script.py ``` + +### cudf.pandas CLI Features + +Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin. + +Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter. 
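Whichever of these entry points is used, the code executes under the same pandas proxy as a script launched with `python -m cudf.pandas script.py`. As a minimal sketch (the file name and data here are illustrative only, not part of the documented interface):

```python
# check_proxy.py -- run with: python -m cudf.pandas check_proxy.py
import pandas as pd

# Under cudf.pandas, the imported module is a proxy that executes
# supported operations on the GPU via cuDF and falls back to CPU
# pandas for everything else.
print(pd)

df = pd.DataFrame({"key": [1, 1, 2], "val": [10, 20, 30]})
print(df.groupby("key")["val"].sum())  # dispatched to cuDF when supported
```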
+ +The `-c` flag accepts a code string to run, like this: + +```bash +$ python -m cudf.pandas -c "import pandas; print(pandas)" + +``` + +Users can also provide code to execute from stdin, like this: + +```bash +$ echo "import pandas; print(pandas)" | python -m cudf.pandas + +``` diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 2eaa75b3189..95f5f9734dd 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -5,9 +5,9 @@ "id": "4c6c548b", "metadata": {}, "source": [ - "# 10 Minutes to cuDF and Dask-cuDF\n", + "# 10 Minutes to cuDF and Dask cuDF\n", "\n", - "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n", + "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask cuDF, geared mainly towards new users.\n", "\n", "## What are these Libraries?\n", "\n", @@ -18,13 +18,14 @@ "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", "\n", "\n", - "> [!NOTE] \n", - "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n", + "
\n", + "Note: This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's configuration infrastructure to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the Dask cuDF documentation for more information.\n", + "
\n", "\n", "\n", - "## When to use cuDF and Dask-cuDF\n", + "## When to use cuDF and Dask cuDF\n", "\n", - "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." + "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask cuDF." ] }, { @@ -115,7 +116,7 @@ "source": [ "ds = dask_cudf.from_cudf(s, npartitions=2)\n", "# Note the call to head here to show the first few entries, unlike\n", - "# cuDF objects, dask-cuDF objects do not have a printing\n", + "# cuDF objects, Dask-cuDF objects do not have a printing\n", "# representation that shows values since they may not be in local\n", "# memory.\n", "ds.head(n=3)" @@ -331,11 +332,11 @@ "id": "b17db919", "metadata": {}, "source": [ - "Now we will convert our cuDF dataframe into a dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", + "Now we will convert our cuDF dataframe into a Dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", "\n", - "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", + "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the Dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. 
Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a Dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", "\n", - "*To understand more of the differences between how cuDF and dask-cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" + "*To understand more of the differences between how cuDF and Dask cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" ] }, { @@ -1680,7 +1681,7 @@ "id": "7aa0089f", "metadata": {}, "source": [ - "Note here we call `compute()` rather than `head()` on the dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." + "Note here we call `compute()` rather than `head()` on the Dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." ] }, { @@ -2393,7 +2394,7 @@ "id": "f6094cbe", "metadata": {}, "source": [ - "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." + "Applying functions to a `Series`. Note that applying user defined functions directly with Dask cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." ] }, { @@ -3492,7 +3493,7 @@ "id": "5ac3b004", "metadata": {}, "source": [ - "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." + "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask cuDF." ] }, { @@ -4181,7 +4182,7 @@ "id": "aa8a445b", "metadata": {}, "source": [ - "To convert the first few entries to pandas, we similarly call `.head()` on the dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." + "To convert the first few entries to pandas, we similarly call `.head()` on the Dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." ] }, { @@ -4899,7 +4900,7 @@ "id": "787eae14", "metadata": {}, "source": [ - "Note that for the dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." + "Note that for the Dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." ] }, { diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md index adcdaa51e7e..97b961b455b 100644 --- a/docs/cudf/source/user_guide/io/io.md +++ b/docs/cudf/source/user_guide/io/io.md @@ -75,7 +75,6 @@ IO format. - **Notes:** - \[¹\] - Not all orientations are GPU-accelerated. 
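The next hunk extends the nvCOMP compression-support table documented in this section of `io.md`. As a hedged sketch of how a user would opt in to the codecs marked Experimental, assuming the `LIBCUDF_NVCOMP_POLICY` environment variable described alongside the table (where leaving it unset behaves like the "STABLE" option):

```python
import os

# Assumption: LIBCUDF_NVCOMP_POLICY is the environment variable this
# section documents; "ALWAYS" opts in to Experimental codecs, "OFF"
# disables nvCOMP, and unset behaves like "STABLE".
os.environ["LIBCUDF_NVCOMP_POLICY"] = "ALWAYS"

import cudf

df = cudf.DataFrame({"a": range(1000)})
# The codec is still chosen per call; the policy only controls whether
# nvCOMP may be used to implement it.
df.to_parquet("data.parquet", compression="snappy")
```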
@@ -177,4 +176,9 @@ If no value is set, behavior will be the same as the "STABLE" option. +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ | DEFLATE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | Experimental | Experimental | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ + | LZ4 | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ + | GZIP | ❌ | ❌ | Experimental | Experimental | ❌ | ❌ | ❌ | ❌ | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ + ``` diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 9a216690384..7fe6cbd45fa 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -3,39 +3,42 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to dask-cudf's documentation! +Welcome to Dask cuDF's documentation! ===================================== -**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension +**Dask cuDF** (pronounced "DASK KOO-dee-eff") is an extension library for the `Dask `__ parallel computing -framework that provides a `cuDF -`__-backed distributed -dataframe with the same API as `Dask dataframes -`__. +framework. When installed, Dask cuDF is automatically registered +as the ``"cudf"`` dataframe backend for +`Dask DataFrame `__. + +.. note:: + Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU + or multi-node execution on their own. You must also deploy a + `dask.distributed ` cluster + to leverage multiple GPUs. We strongly recommend using `Dask-CUDA + `__ to simplify the + setup of the cluster, taking advantage of all features of the GPU + and networking hardware. If you are familiar with Dask and `pandas `__ or -`cuDF `__, then Dask-cuDF +`cuDF `__, then Dask cuDF should feel familiar to you. If not, we recommend starting with `10 minutes to Dask `__ followed -by `10 minutes to cuDF and Dask-cuDF +by `10 minutes to cuDF and Dask cuDF `__. -When running on multi-GPU systems, `Dask-CUDA -`__ is recommended to -simplify the setup of the cluster, taking advantage of all features of -the GPU and networking hardware. -Using Dask-cuDF +Using Dask cuDF --------------- -When installed, Dask-cuDF registers itself as a dataframe backend for -Dask. This means that in many cases, using cuDF-backed dataframes requires -only small changes to an existing workflow. The minimal change is to -select cuDF as the dataframe backend in :doc:`Dask's -configuration `. To do so, we must set the option -``dataframe.backend`` to ``cudf``. From Python, this can be achieved -like so:: +The Dask DataFrame API (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Simply use the `Dask configuration ` system to +set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, +this can be achieved like so:: import dask @@ -44,52 +47,157 @@ like so:: Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the environment before running your code. 
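A short sketch of that environment-variable route (the one assumption being that the variable must be in place before ``dask.dataframe`` is first imported):

```python
import os

# Set the backend before dask.dataframe is imported so the option is
# picked up when collections are created.
os.environ["DASK_DATAFRAME__BACKEND"] = "cudf"

import dask.dataframe as dd

df = dd.read_parquet("data.parquet")  # cuDF-backed, with no code changes
```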
-Dataframe creation from on-disk formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If your workflow creates Dask dataframes from on-disk formats -(for example using :func:`dask.dataframe.read_parquet`), then setting -the backend may well be enough to migrate your workflow. - -For example, consider reading a dataframe from parquet:: +Once this is done, the public Dask DataFrame API will leverage +``cudf`` automatically when a new DataFrame collection is created +from an on-disk format using any of the following ``dask.dataframe`` +functions: - import dask.dataframe as dd +* :func:`dask.dataframe.read_parquet` +* :func:`dask.dataframe.read_json` +* :func:`dask.dataframe.read_csv` +* :func:`dask.dataframe.read_orc` +* :func:`dask.dataframe.read_hdf` +* :func:`dask.dataframe.from_dict` - # By default, we obtain a pandas-backed dataframe - df = dd.read_parquet("data.parquet", ...) +For example:: + import dask.dataframe as dd -To obtain a cuDF-backed dataframe, we must set the -``dataframe.backend`` configuration option:: + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) import dask - import dask.dataframe as dd dask.config.set({"dataframe.backend": "cudf"}) - # This gives us a cuDF-backed dataframe + # This now gives us a cuDF-backed dataframe df = dd.read_parquet("data.parquet", ...) -This code will use cuDF's GPU-accelerated :func:`parquet reader -` to read partitions of the data. +When other functions are used to create a new collection +(e.g. :func:`from_map`, :func:`from_pandas`, :func:`from_delayed`, +and :func:`from_array`), the backend of the new collection will +depend on the inputs to those functions. For example:: + + import pandas as pd + import cudf + + # This gives us a pandas-backed dataframe + dd.from_pandas(pd.DataFrame({"a": range(10)})) + + # This gives us a cuDF-backed dataframe + dd.from_pandas(cudf.DataFrame({"a": range(10)})) + +An existing collection can always be moved to a specific backend +using the :func:`dask.dataframe.DataFrame.to_backend` API:: + + # This ensures that we have a cuDF-backed dataframe + df = df.to_backend("cudf") + + # This ensures that we have a pandas-backed dataframe + df = df.to_backend("pandas") + +The explicit Dask cuDF API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to providing the ``"cudf"`` backend for Dask DataFrame, +Dask cuDF also provides an explicit ``dask_cudf`` API:: + + import dask_cudf + + # This always gives us a cuDF-backed dataframe + df = dask_cudf.read_parquet("data.parquet", ...) + +This API is used implicitly by the Dask DataFrame API when the ``"cudf"`` +backend is enabled. Therefore, using it directly will not provide any +performance benefit over the CPU/GPU-portable ``dask.dataframe`` API. +Also, some parts of the explicit API are incompatible with +automatic query planning (see the next section). + +Query Planning +~~~~~~~~~~~~~~ + +Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). +As long as the ``"dataframe.query-planning"`` configuration is set to +``True`` (the default) when ``dask.dataframe`` is first imported, `Dask +Expressions `__ will be used under the hood.
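Since that flag is read when ``dask.dataframe`` is first imported, pinning it explicitly has to happen before that import; a minimal sketch:

```python
import dask

# Assumption: this runs before the first `import dask.dataframe`;
# afterwards the setting is baked into how collections are built.
dask.config.set({"dataframe.query-planning": True})

import dask.dataframe as dd  # expression-based query planning is now active
```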
+ +For example, the following code will automatically benefit from predicate +pushdown when the result is computed:: + + df = dd.read_parquet("/my/parquet/dataset/") + result = df.sort_values('B')['A'] + +Unoptimized expression graph (``df.pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' ... + +Simplified expression graph (``df.simplify().pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... + +.. note:: + Dask will automatically simplify the expression graph (within + :func:`optimize`) when the result is converted to a task graph + (via :func:`compute` or :func:`persist`). You do not need to call + :func:`simplify` yourself. + + +Using Multiple GPUs and Multiple Nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try +to partition your data into small-enough tasks to fit comfortably in the +memory of a single GPU. This means the necessary compute tasks needed to +compute a query can often be streamed to a single GPU process for +out-of-core computing. This also means that the compute tasks can be +executed in parallel over a multi-GPU cluster. + +In order to execute your Dask workflow on multiple GPUs, you will +typically need to use `Dask-CUDA `__ +to deploy a distributed Dask cluster, and +`Distributed `__ +to define a client object. For example:: + + from dask_cuda import LocalCUDACluster + from distributed import Client + + if __name__ == "__main__": + + client = Client( + LocalCUDACluster( + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling + ) + ) + + df = dd.read_parquet("/my/parquet/dataset/") + agg = df.groupby('B').sum() + agg.compute() # This will use the cluster defined above + +.. note:: + This example uses :func:`compute` to materialize a concrete + ``cudf.DataFrame`` object in local memory. Never call :func:`compute` + on a large collection that cannot fit comfortably in the memory of a + single GPU! See Dask's `documentation on managing computation + `__ + for more details. -Dataframe creation from in-memory formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Please see the `Dask-CUDA `__ +documentation for more information about deploying GPU-aware clusters +(including `best practices +`__). -If you already have a dataframe in memory and want to convert it to a -cuDF-backend one, there are two options depending on whether the -dataframe is already a Dask one or not. If you have a Dask dataframe, -then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` -as the backend; if you have a pandas dataframe then you can either -call :func:`dask.dataframe.from_pandas` followed by -:func:`~dask.dataframe.to_backend` or first convert the dataframe with -:func:`cudf.from_pandas` and then parallelise this with -:func:`dask_cudf.from_cudf`. API Reference ------------- -Generally speaking, Dask-cuDF tries to offer exactly the same API as -Dask itself. There are, however, some minor differences mostly because +Generally speaking, Dask cuDF tries to offer exactly the same API as +Dask DataFrame.
There are, however, some minor differences mostly because cuDF does not :doc:`perfectly mirror ` the pandas API, or because cuDF provides additional configuration flags (these mostly occur in data reading and writing interfaces). @@ -97,7 +205,7 @@ flags (these mostly occur in data reading and writing interfaces). As a result, straightforward workflows can be migrated without too much trouble, but more complex ones that utilise more features may need a bit of tweaking. The API documentation describes details of the -differences and all functionality that Dask-cuDF supports. +differences and all functionality that Dask cuDF supports. .. toctree:: :maxdepth: 2 diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 5a0fbd224ad..6a0f0f6f169 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount, od, vd, nullCount, numRows, childHandles); } - static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { + /** + * Creates a ColumnVector from a native column_view using a contiguous device allocation. + * + * @param columnViewAddress address of the native column_view + * @param buffer device buffer containing the data referenced by the column view + */ + public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { return new ColumnVector(columnViewAddress, buffer); } diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index b37d0d88ec9..c8308ca17ec 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -34,6 +34,10 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; + private final boolean strictValidation; + private final boolean allowLeadingZeros; + private final boolean allowNonNumericNumbers; + private final boolean allowUnquotedControlChars; private JSONOptions(Builder builder) { super(builder); @@ -44,6 +48,10 @@ private JSONOptions(Builder builder) { normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; + strictValidation = builder.strictValidation; + allowLeadingZeros = builder.allowLeadingZeros; + allowNonNumericNumbers = builder.allowNonNumericNumbers; + allowUnquotedControlChars = builder.allowUnquotedControlChars; } public boolean isDayFirst() { @@ -75,6 +83,22 @@ public boolean keepStringQuotes() { return keepStringQuotes; } + public boolean strictValidation() { + return strictValidation; + } + + public boolean leadingZerosAllowed() { + return allowLeadingZeros; + } + + public boolean nonNumericNumbersAllowed() { + return allowNonNumericNumbers; + } + + public boolean unquotedControlChars() { + return allowUnquotedControlChars; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -85,6 +109,10 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { + private boolean strictValidation = false; + private boolean allowUnquotedControlChars = true; + private boolean 
allowNonNumericNumbers = false; + private boolean allowLeadingZeros = false; private boolean dayFirst = false; private boolean lines = true; @@ -95,10 +123,45 @@ public static final class Builder extends ColumnFilterOptions.Builder { + private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group + private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; public Builder() { super(); } + public Builder withRowGroupSizeRows(int rowGroupSizeRows) { + this.rowGroupSizeRows = rowGroupSizeRows; + return this; + } + + public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) { + this.rowGroupSizeBytes = rowGroupSizeBytes; + return this; + } + public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { this.statsGranularity = statsGranularity; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 36e342cae13..09da43374ae 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -254,7 +254,11 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean normalizeSingleQuotes, boolean normalizeWhitespace, boolean mixedTypesAsStrings, - boolean keepStringQuotes) throws CudfException; + boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl) throws CudfException; private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames, int[] dTypeIds, int[] dTypeScales, @@ -264,6 +268,10 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean normalizeWhitespace, boolean mixedTypesAsStrings, boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl, long dsHandle) throws CudfException; private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines, @@ -272,7 +280,12 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool boolean normalizeWhitespace, boolean mixedTypesAsStrings, boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl, long dsHandle) throws CudfException; + private static native long readAndInferJSON(long address, long length, boolean dayFirst, boolean lines, @@ -280,7 +293,11 @@ private static native long readAndInferJSON(long address, long length, boolean normalizeSingleQuotes, boolean normalizeWhitespace, boolean mixedTypesAsStrings, - boolean keepStringQuotes) throws CudfException; + boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl) throws CudfException; /** * Read in Parquet formatted data. @@ -315,20 +332,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames, /** * Setup everything to write parquet formatted data to a file. 
- * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param filename local output path + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetFileBegin(String[] columnNames, @@ -338,6 +357,8 @@ private static native long writeParquetFileBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -349,20 +370,22 @@ private static native long writeParquetFileBegin(String[] columnNames, /** * Setup everything to write parquet formatted data to a buffer. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param consumer consumer of host buffers produced. 
+ * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetBufferBegin(String[] columnNames, @@ -372,6 +395,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -1292,7 +1317,11 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), opts.isMixedTypesAsStrings(), - opts.keepStringQuotes()))) { + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1370,7 +1399,12 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), - opts.isMixedTypesAsStrings(), opts.keepStringQuotes())); + opts.isMixedTypesAsStrings(), + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars())); } /** @@ -1388,6 +1422,10 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.isNormalizeWhitespace(), opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), dsHandle)); return twm; } finally { @@ -1430,10 +1468,18 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, - buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), + buffer.getAddress() + offset, len, + opts.isDayFirst(), + opts.isLines(), + opts.isRecoverWithNull(), + opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), - opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) { + opts.isMixedTypesAsStrings(), + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1454,17 +1500,26 @@ public static Table readJSON(Schema schema, 
JSONOptions opts, DataSource ds) { * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. - * @param emtpyRowCount the number of rows to return if no columns were read. + * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ - public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emtpyRowCount) { + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), - schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), - opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), + schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), + opts.isDayFirst(), + opts.isLines(), + opts.isRecoverWithNull(), + opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), - opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) { - return gatherJSONColumns(schema, twm, emtpyRowCount); + opts.isMixedTypesAsStrings(), + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), + dsHandle))) { + return gatherJSONColumns(schema, twm, emptyRowCount); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } @@ -1773,6 +1828,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), @@ -1793,6 +1850,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c5abf08a59d..92e213bcb60 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1623,6 +1623,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1642,8 +1646,13 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); - + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = 
std::make_unique(cudf::io::read_json(opts.build())); @@ -1652,17 +1661,22 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, - jclass, - jlong buffer, - jlong buffer_length, - jboolean day_first, - jboolean lines, - jboolean recover_with_null, - jboolean normalize_single_quotes, - jboolean normalize_whitespace, - jboolean mixed_types_as_string, - jboolean keep_quotes) +JNIEXPORT jlong JNICALL +Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, + jclass, + jlong buffer, + jlong buffer_length, + jboolean day_first, + jboolean lines, + jboolean recover_with_null, + jboolean normalize_single_quotes, + jboolean normalize_whitespace, + jboolean mixed_types_as_string, + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1684,8 +1698,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) + .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1790,6 +1810,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1824,7 +1848,13 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -1874,7 +1904,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { bool read_buffer = true; if (buffer == 0) { @@ -1923,7 +1957,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if 
(!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -2110,6 +2150,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2165,6 +2207,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) @@ -2187,6 +2231,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2240,6 +2286,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 050bcbb268f..830f2b33b32 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException { } } + @Test void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -455,6 +456,206 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { } } + private static final byte[] JSON_VALIDATION_BUFFER = ( + "{\"a\":true}\n" + + "{\"a\":false}\n" + + "{\"a\":null}\n" + + "{\"a\":true, \"b\":truee}\n" + + "{\"a\":true, \"b\":\"nulll\"}\n" + + "{\"a\": 1}\n" + + "{\"a\": 0}\n" + + "{\"a\": -}\n" + + "{\"a\": -0}\n" + + "{\"a\": -01}\n" + + + "{\"a\": 01}\n" + + "{\"a\": -0.1}\n" + + "{\"a\": -00.1}\n" + + "{\"a\": NaN}\n" + + "{\"a\": INF}\n" + + "{\"a\": +INF}\n" + + "{\"a\": -INF}\n" + + "{\"a\": +Infinity}\n" + + "{\"a\": Infinity}\n" + + "{\"a\": -Infinity}\n" + + + "{\"a\": INFinity}\n" + + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" + + "{\"a\":12.}\n" + + "{\"a\": -3.4e+38}\n" + + "{\"a\": -3.4e-38}\n" + + "{\"a\": 1.4e38}\n" + + "{\"a\": -3.4E+38}\n" + + "{\"a\": -3.4E-38}\n" + + "{\"a\": 1.4E38}\n" + + "{\"a\": -3.4E+}\n" + + + "{\"a\": -3.4E-}\n" + + "{\"a\": \"A\u0000B\"}\n" + + "{\"a\": \"A\\u0000B\"}\n" + + "{\"a\": \"A\u0001B\"}\n" + + "{\"a\": \"A\\u0001B\"}\n" + + "{\"a\": \"A\u001FB\"}\n" + + "{\"a\": \"A\\u001FB\"}\n" + + "{\"a\": \"A\u0020B\"}\n" + + "{\"a\": \"A\\u0020B\"}\n" + + "{\"a\": \"\\u12\"}\n" + + + "{\"a\": \"\\z\"}\n" + + "{\"a\": \"\\r\"}\n" + + "{\"a\": \"something\", \"b\": \"\\z\"}\n" + ).getBytes(StandardCharsets.UTF_8); + + @Test + void testJSONValidationNoStrict() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + 
.withNormalizeSingleQuotes(true) + .withStrictValidation(false) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", + "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", "\"something\"") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidation() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationLeadingZeros() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(true) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", "-01", + "01", "-0.1", "-00.1", null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationNonNumeric() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + 
.withLeadingZeros(false) + .withNonNumericNumbers(true) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationUnquotedControl() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + @@ -8921,7 +9122,11 @@ void testParquetWriteToBufferChunked() { columns.add(Columns.STRUCT.name); WriteUtils.buildWriterOptions(optBuilder, columns); ParquetWriterOptions options = optBuilder.build(); - ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build(); + ParquetWriterOptions optionsNoCompress = + optBuilder.withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) + .build(); try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { @@ -9007,6 +9212,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { .withDecimalColumn("_c7", 4) .withDecimalColumn("_c8", 6) .withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE) .build(); try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d7da42a1708..99b759e2166 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -46,7 +46,7 @@ ListDtype, StructDtype, ) -from cudf.core.groupby import Grouper +from cudf.core.groupby import Grouper, NamedAgg from cudf.core.index import ( BaseIndex, CategoricalIndex, diff --git 
a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5ee15d0e409..59cb8d51440 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.types cimport size_type @@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width): ) return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 47a194c4fda..4bf8a9b1a8f 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, @@ -6,7 +6,12 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash, minhash64 +from cudf._lib.nvtext.minhash import ( + minhash, + minhash64, + word_minhash, + word_minhash64, +) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 16e6908f308..e059917b0b8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5349,6 +5349,76 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint32. 
+ + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> ls.str.word_minhash(seeds=seeds) + 0 [21141582, 1232889953, 1268336794] + 1 [962346254, 2321233602, 1354839212] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash(self._column, seeds_column) + ) + + def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> ls.str.word_minhash64(seeds) + 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] + 1 [5240044617220523711, 5847101123925041457, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash64(self._column, seeds_column) + ) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/core/groupby/__init__.py b/python/cudf/cudf/core/groupby/__init__.py index 4375ed3e3da..621edb316cf 100644 --- a/python/cudf/cudf/core/groupby/__init__.py +++ b/python/cudf/cudf/core/groupby/__init__.py @@ -1,8 +1,9 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.groupby.groupby import GroupBy, Grouper +from cudf.core.groupby.groupby import GroupBy, Grouper, NamedAgg __all__ = [ "GroupBy", "Grouper", + "NamedAgg", ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4f283d41b17..6424c8af877 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -76,6 +76,34 @@ def _is_row_of(chunk, obj): ) +NamedAgg = pd.NamedAgg + + +NamedAgg.__doc__ = """ +Helper for column specific aggregation with control over output column names. + +Subclass of typing.NamedTuple. + +Parameters +---------- +column : Hashable + Column label in the DataFrame to apply aggfunc. +aggfunc : function or str + Function to apply to the provided column. + +Examples +-------- +>>> df = cudf.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) +>>> agg_a = cudf.NamedAgg(column="a", aggfunc="min") +>>> agg_1 = cudf.NamedAgg(column=1, aggfunc=lambda x: x.mean()) +>>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) + result_a result_1 +key +1 -1 10.5 +2 1 12.0 +""" + + groupby_doc_template = textwrap.dedent( """Group using a mapper or by a Series of columns. 
@@ -1296,9 +1324,21 @@ def _normalize_aggs( columns = values._columns aggs_per_column = (aggs,) * len(columns) elif not aggs and kwargs: - column_names, aggs_per_column = kwargs.keys(), kwargs.values() - columns = tuple(self.obj._data[x[0]] for x in kwargs.values()) - aggs_per_column = tuple(x[1] for x in kwargs.values()) + column_names = kwargs.keys() + + def _raise_invalid_type(x): + raise TypeError( + f"Invalid keyword argument {x} of type {type(x)} was passed to agg" + ) + + columns, aggs_per_column = zip( + *( + (self.obj._data[x[0]], x[1]) + if isinstance(x, tuple) + else _raise_invalid_type(x) + for x in kwargs.values() + ) + ) else: raise TypeError("Must provide at least one aggregation function.") diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3d205957126..c026579b8b5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -738,7 +738,8 @@ def get_dummies( sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional - Right now this is NON-FUNCTIONAL argument in rapids. + Whether to get k-1 dummies out of k categorical levels by removing the + first level. columns : sequence of str, optional Names of columns to encode. If not provided, will attempt to encode all columns. Note this is different from pandas default behavior, which @@ -806,9 +807,6 @@ def get_dummies( if sparse: raise NotImplementedError("sparse is not supported yet") - if drop_first: - raise NotImplementedError("drop_first is not supported yet") - if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] @@ -862,6 +860,7 @@ def get_dummies( prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, + drop_first=drop_first, ) result_data.update(col_enc_data) return cudf.DataFrame._from_data(result_data, index=data.index) @@ -874,6 +873,7 @@ def get_dummies( prefix=prefix, prefix_sep=prefix_sep, dtype=dtype, + drop_first=drop_first, ) return cudf.DataFrame._from_data(data, index=ser.index) @@ -1256,6 +1256,7 @@ def _one_hot_encode_column( prefix: str | None, prefix_sep: str | None, dtype: Dtype | None, + drop_first: bool, ) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). 
The keys may be prefixed with @@ -1276,6 +1277,8 @@ def _one_hot_encode_column( ) data = one_hot_encode(column, categories) + if drop_first and len(data): + data.pop(next(iter(data))) if prefix is not None and prefix_sep is not None: data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} if dtype: diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 3a82829eb7a..e0d3d9101a9 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -10,6 +10,7 @@ """ import argparse +import code import runpy import sys import tempfile @@ -21,6 +22,8 @@ @contextmanager def profile(function_profile, line_profile, fn): + if fn is None and (line_profile or function_profile): + raise RuntimeError("Enabling the profiler requires a script name.") if line_profile: with open(fn) as f: lines = f.readlines() @@ -54,6 +57,11 @@ def main(): dest="module", nargs=1, ) + parser.add_argument( + "-c", + dest="cmd", + nargs=1, + ) parser.add_argument( "--profile", action="store_true", @@ -72,9 +80,18 @@ def main(): args = parser.parse_args() + if args.cmd: + f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") + f.write(args.cmd[0].encode()) + f.seek(0) + args.args.insert(0, f.name) + install() - with profile(args.profile, args.line_profile, args.args[0]) as fn: - args.args[0] = fn + + script_name = args.args[0] if len(args.args) > 0 else None + with profile(args.profile, args.line_profile, script_name) as fn: + if script_name is not None: + args.args[0] = fn if args.module: (module,) = args.module # run the module passing the remaining arguments @@ -85,6 +102,21 @@ def main(): # Remove ourself from argv and continue sys.argv[:] = args.args runpy.run_path(args.args[0], run_name="__main__") + else: + if sys.stdin.isatty(): + banner = f"Python {sys.version} on {sys.platform}" + site_import = not sys.flags.no_site + if site_import: + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' 
+ banner += "\n" + cprt + else: + # Don't show prompts or banners if stdin is not a TTY + sys.ps1 = "" + sys.ps2 = "" + banner = "" + + # Launch an interactive interpreter + code.interact(banner=banner, exitmsg="") if __name__ == "__main__": diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index 99e7523031b..dc20a27177a 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -56,3 +56,19 @@ def test_dataframe_agg(attr, func): ) assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)( + foo=cudf.NamedAgg(column="b", aggfunc=func), + bar=cudf.NamedAgg(column="a", aggfunc=func), + ) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + assert_eq(agg, pd_agg) + + +def test_dataframe_agg_with_invalid_kwarg(): + with pytest.raises(TypeError, match="Invalid keyword argument"): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + df.groupby("a").agg(foo=set()) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index cc17dc46e0a..e054143b438 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated(): df = cudf.DataFrame(range(3)) with pytest.warns(FutureWarning): cudf.get_dummies(df, cats={0: [0, 1, 2]}) + + +def test_get_dummies_drop_first_series(): + result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True) + expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) + assert_eq(result, expected) + + +def test_get_dummies_drop_first_dataframe(): + result = cudf.get_dummies( + cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), + drop_first=True, + ) + expected = pd.get_dummies( + pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b59a7eef08..7f1b0b1cd46 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3822,8 +3822,8 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df1 = cudf.DataFrame( { "i32": cudf.Series([None, None, None], dtype="int32"), - "i64": cudf.Series([1234, None, 123], dtype="int64"), - "list": list([[1, 2], [None, 4], [5, 6]]), + "i64": cudf.Series([1234, 467, 123], dtype="int64"), + "list": list([[1, 2], None, [None, 6]]), "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), "str": ["vfd", None, "ghu"], "d_list": list( @@ -3838,14 +3838,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df2 = cudf.DataFrame( { - "str": ["abc", "def", None], + "str": ["abc", "def", "ghi"], "i64": cudf.Series([None, 65, 98], dtype="int64"), "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), - "list": list([[7, 8], [9, 10], [None, 12]]), + "list": list([[7, 8], [9, 10], [11, 12]]), "d_list": list( [ [pd.Timedelta(minutes=4), None], - [None, None], + None, [pd.Timedelta(minutes=6), None], ] ), @@ -3900,38 +3900,27 @@ def test_parquet_reader_with_mismatched_structs(): { "a": 1, "b": { - "inner_a": 10, - "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2}, + "a_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 2}, }, "c": 2, }, { "a": 3, - "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}}, + "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, "c": 4, }, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 5, "b": {"b_a": 50, 
"b_b": None}, "c": 6}, {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - { - "a": None, - "b": { - "inner_a": None, - "inner_b": {"inner_inner_b": None, "inner_inner_a": 10}, - }, - "c": 10, - }, + {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, ] data2 = [ - {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}}, - {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}}, - {"a": 5, "b": {"inner_b": None}}, - {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}}, - {"a": None, "b": {"inner_b": None}}, + {"a": 1, "b": {"b_b": {"b_b_a": None}}}, + {"a": 5, "b": {"b_b": None}}, + {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, + {"a": None, "b": {"b_b": None}}, None, - {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}}, ] # cuDF tables from struct data @@ -3949,20 +3938,20 @@ def test_parquet_reader_with_mismatched_structs(): # Read the struct.b.inner_b.inner_inner_a column from parquet got = cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], allow_mismatched_pq_schemas=True, ) got = ( cudf.Series(got["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Read with chunked reader got_chunked = read_parquet_chunked( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], chunk_read_limit=240, pass_read_limit=240, allow_mismatched_pq_schemas=True, @@ -3970,8 +3959,8 @@ def test_parquet_reader_with_mismatched_structs(): got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Construct the expected series @@ -3979,12 +3968,12 @@ def test_parquet_reader_with_mismatched_structs(): [ cudf.Series(df1["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), cudf.Series(df2["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), ] ).reset_index(drop=True) @@ -4023,12 +4012,12 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) data1 = [ - {"a": 1, "b": {"inner_a": 1, "inner_b": 6}}, - {"a": 3, "b": {"inner_a": None, "inner_b": 2}}, + {"a": 1, "b": {"b_a": 1, "b_b": 6}}, + {"a": 3, "b": {"b_a": None, "b_b": 2}}, ] data2 = [ - {"b": {"inner_a": 1}, "c": "str"}, - {"b": {"inner_a": None}, "c": None}, + {"b": {"b_a": 1}, "c": "str"}, + {"b": {"b_a": None}, "c": None}, ] # cuDF tables from struct data @@ -4059,6 +4048,191 @@ def test_parquet_reader_with_mismatched_schemas_error(): ): cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b"], + columns=["struct.b.b_b"], allow_mismatched_pq_schemas=True, ) + + +def test_parquet_reader_mismatched_nullability(): + # Ensure that we can faithfully read the tables with mismatched nullabilities + df1 = cudf.DataFrame( + { + "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + None, + [pd.Timedelta(minutes=8), None], + ], + None, + ], + None, + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, None, 4123], dtype="int64"), + "int32": 
cudf.Series([1234, 123, 4123], dtype="int32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "string": cudf.Series(["kitten", "puppy", "cub"]), + } + ) + + df2 = cudf.DataFrame( + { + "timedelta": cudf.Series( + [None, None, None], dtype="timedelta64[ms]" + ), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], + ], + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int32": cudf.Series([1234, None, 4123], dtype="int32"), + "list": list([[1, 2], None, [1, 2]]), + "datetime": cudf.Series( + [1234, None, 4123], dtype="datetime64[ms]" + ), + "string": cudf.Series(["kitten", None, "cub"]), + } + ) + + # Write tables to parquet with arrow schema for compatibility with duration column(s) + fname1 = BytesIO() + df1.to_parquet(fname1, store_schema=True) + fname2 = BytesIO() + df2.to_parquet(fname2, store_schema=True) + + # Read tables back with cudf in either order and compare + assert_eq( + cudf.read_parquet([fname1, fname2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([fname2, fname1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) + + +def test_parquet_reader_mismatched_nullability_structs(tmpdir): + data1 = [ + { + "a": "a", + "b": { + "b_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 12}, + }, + "c": [1, 2], + }, + { + "a": "b", + "b": { + "b_a": 30, + "b_b": {"b_b_b": 2, "b_b_a": 2}, + }, + "c": [3, 4], + }, + { + "a": "c", + "b": { + "b_a": 50, + "b_b": {"b_b_b": 4, "b_b_a": 5}, + }, + "c": [5, 6], + }, + { + "a": "d", + "b": { + "b_a": 135, + "b_b": {"b_b_b": 12, "b_b_a": 32}, + }, + "c": [7, 8], + }, + { + "a": "e", + "b": { + "b_a": 1, + "b_b": {"b_b_b": 1, "b_b_a": 5}, + }, + "c": [9, 10], + }, + { + "a": "f", + "b": { + "b_a": 32, + "b_b": {"b_b_b": 1, "b_b_a": 6}, + }, + "c": [11, 12], + }, + ] + + data2 = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] + + pa_table1 = pa.Table.from_pydict({"struct": data1}) + df1 = cudf.DataFrame.from_arrow(pa_table1) + + pa_table2 = pa.Table.from_pydict({"struct": data2}) + df2 = cudf.DataFrame.from_arrow(pa_table2) + + # Write tables to parquet + buf1 = BytesIO() + df1.to_parquet(buf1) + buf2 = BytesIO() + df2.to_parquet(buf2) + + # Read tables back with cudf and compare with expected.
+ assert_eq( + cudf.read_parquet([buf1, buf2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([buf2, buf1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 52179f55da3..997ca357986 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -946,6 +946,66 @@ def test_minhash(): strings.str.minhash64(seeds=seeds) +def test_word_minhash(): + ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), + cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash(seeds=seeds) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + cudf.Series([2603139454418834912], dtype=np.uint64), + cudf.Series([5240044617220523711], dtype=np.uint64), + ] + ) + actual = ls.str.word_minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [ + 2603139454418834912, + 8644371945174847701, + 5541030711534384340, + ], + dtype=np.uint64, + ), + cudf.Series( + [5240044617220523711, 5847101123925041457, 153762819128779913], + dtype=np.uint64, + ), + ] + ) + actual = ls.str.word_minhash64(seeds=seeds) + assert_eq(expected, actual) + + # test wrong seed types + with pytest.raises(ValueError): + ls.str.word_minhash(seeds="a") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + ls.str.word_minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + ls.str.word_minhash64(seeds=seeds) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py new file mode 100644 index 00000000000..326224c8fc0 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import tempfile +import textwrap + + +def _run_python(*, cudf_pandas, command): + executable = "python " + if cudf_pandas: + executable += "-m cudf.pandas " + return subprocess.run( + executable + command, + shell=True, + capture_output=True, + check=True, + text=True, + ) + + +def test_run_cudf_pandas_with_script(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f: + code = textwrap.dedent( + """ + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3]}) + print(df['a'].sum()) + """ + ) + f.write(code) + f.flush() + + res = _run_python(cudf_pandas=True, command=f.name) + expect = _run_python(cudf_pandas=False, command=f.name) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args(): + input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): + """Verify that cudf is active with -m cudf.pandas.""" + input_args_and_code = """-c 'import pandas as pd; print(pd)'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert "cudf" in res.stdout + assert "cudf" not in expect.stdout + + +def test_cudf_pandas_script_repl(): + def start_repl_process(cmd): + return subprocess.Popen( + cmd.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + + def get_repl_output(process, commands): + for command in commands: + process.stdin.write(command) + process.stdin.flush() + return process.communicate() + + p1 = start_repl_process("python -m cudf.pandas") + p2 = start_repl_process("python") + commands = [ + "import pandas as pd\n", + "print(pd.Series(range(2)).sum())\n", + "print(pd.Series(range(5)).sum())\n", + "import sys\n", + "print(pd.Series(list('abcd')), file=sys.stderr)\n", + ] + + res = get_repl_output(p1, commands) + expect = get_repl_output(p2, commands) + + # Check stdout + assert res[0] != "" + assert res[0] == expect[0] + + # Check stderr + assert res[1] != "" + assert res[1] == expect[1] + + p1.kill() + p2.kill() diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 1509dac9e61..8da17ef09dc 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -54,7 +54,7 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### Conda -cuStreamz is installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` or `rapidsai-nightly` channel: +cuStreamz can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: Release: ```bash diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 6edb9f87d48..4655d2165f0 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -1,135 +1,63 @@ #
 Dask cuDF - A GPU Backend for Dask DataFrame
-Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. - -## Using Dask cuDF - -### The Dask DataFrame API (Recommended) - -Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically: - -```python -import dask -dask.config.set({"dataframe.backend": "cudf"}) - -import dask.dataframe as dd -# This gives us a cuDF-backed dataframe -df = dd.read_parquet("data.parquet", ...) -``` +Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html) that provides a Pandas-like API for parallel and larger-than-memory DataFrame computing on GPUs. When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. > [!IMPORTANT] -> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function: - -```python -import pandas as pd -import cudf - -# This gives us a Pandas-backed dataframe -dd.from_pandas(pd.DataFrame({"a": range(10)})) - -# This gives us a cuDF-backed dataframe -dd.from_pandas(cudf.DataFrame({"a": range(10)})) -``` - -A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend: - -```python -df = df.to_backend("pandas") -``` - -Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend: - -```python -df = df.to_backend("cudf") -``` - -### The Explicit Dask cuDF API - -In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API: - -```python -import dask_cudf - -# This always gives us a cuDF-backed dataframe -df = dask_cudf.read_parquet("data.parquet", ...) -``` - -> [!NOTE] -> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section). +> Dask cuDF does not provide support for multi-GPU or multi-node execution on its own. You must also deploy a distributed cluster (ideally with [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs efficiently. -See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information. - -## Query Planning - -Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood. - -For example, the following user code will automatically benefit from predicate pushdown when the result is computed. 
- -```python -df = dd.read_parquet("/my/parquet/dataset/") -result = df.sort_values('B')['A'] -``` - -Unoptimized expression graph (`df.pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' ... -``` +## Using Dask cuDF -Simplified expression graph (`df.simplify().pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... -``` +Please visit [the official documentation page](https://docs.rapids.ai/api/dask-cudf/stable/) for detailed information about using Dask cuDF. -> [!NOTE] -> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself. +## Installation +See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. -## Using Multiple GPUs and Multiple Nodes +## Resources -Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. +- [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) +- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) +- [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) +- [Deployment](https://docs.rapids.ai/deployment/stable/) +- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. -> [!IMPORTANT] -> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs. +### Quick-start example -In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example: +A very common Dask cuDF use case is single-node multi-GPU data processing. 
These workflows typically use the following pattern: ```python - +import dask +import dask.dataframe as dd from dask_cuda import LocalCUDACluster from distributed import Client -client = Client( +if __name__ == "__main__": + + # Define a GPU-aware cluster to leverage multiple GPUs + client = Client( LocalCUDACluster( - CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) - rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations - enable_cudf_spill=True, # Improve device memory stability - local_directory="/fast/scratch/", # Use fast local storage for spilling + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling ) -) + ) -df = dd.read_parquet("/my/parquet/dataset/") -agg = df.groupby('B').sum() -agg.compute() # This will use the cluster defined above -``` + # Set the default dataframe backend to "cudf" + dask.config.set({"dataframe.backend": "cudf"}) -> [!NOTE] -> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details. + # Create your DataFrame collection from on-disk + # or in-memory data + df = dd.read_parquet("/my/parquet/dataset/") -Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)). + # Use cudf-like syntax to transform and/or query your data + query = df.groupby('item')['price'].mean() -## Install - -See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. + # Compute, persist, or write out the result + query.head() +``` -## Resources +If you do not have multiple GPUs available, using `LocalCUDACluster` is optional. However, it is still a good idea to [enable cuDF spilling](https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory). -- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) -- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/) -- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) -- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) -- [Deployment](https://docs.rapids.ai/deployment/stable/) -- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +If you wish to scale across multiple nodes, you will need to use a different mechanism to deploy your Dask-CUDA workers. Please see [the RAPIDS deployment documentation](https://docs.rapids.ai/deployment/stable/) for more instructions. 
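For readers who stop at the single-GPU note above, here is a minimal sketch of what "enabling cuDF spilling" looks like in practice, assuming the `cudf.set_option` interface and the `CUDF_SPILL` environment variable described in the linked spilling guide:

```python
import cudf

# Opt in to cuDF's spilling of device buffers to host memory; this is
# equivalent to launching the process with CUDF_SPILL=on in the environment.
cudf.set_option("spill", True)

# Subsequent cuDF (and Dask cuDF) work may now spill under GPU memory pressure.
df = cudf.DataFrame({"a": range(10)})
print(df["a"].sum())
```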
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index f60e4ff81ef..97e1dffc65b 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs): return self.to_backend("pandas", **kwargs) + def _prepare_cov_corr(self, min_periods, numeric_only): + # Upstream version of this method sets min_periods + # to 2 by default (which is not supported by cudf) + # TODO: Remove when cudf supports both min_periods + # and numeric_only + # See: https://github.com/rapidsai/cudf/issues/12626 + # See: https://github.com/rapidsai/cudf/issues/9009 + self._meta.cov(min_periods=min_periods) + + frame = self + if numeric_only: + numerics = self._meta._get_numeric_data() + if len(numerics.columns) != len(self.columns): + frame = frame[list(numerics.columns)] + return frame, min_periods + # var can be removed if cudf#15179 is addressed. - # See: https://github.com/rapidsai/cudf/issues/15179 + # See: https://github.com/rapidsai/cudf/issues/14935 def var( self, axis=0, diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 905d8c08135..7aa0f6320f2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1007,3 +1007,20 @@ def test_to_backend_simplify(): df2 = df.to_backend("cudf")[["y"]].simplify() df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify() assert df2._name == df3._name + + +@pytest.mark.parametrize("numeric_only", [True, False]) +@pytest.mark.parametrize("op", ["corr", "cov"]) +def test_cov_corr(op, numeric_only): + df = cudf.DataFrame.from_dict( + { + "x": np.random.randint(0, 5, size=10), + "y": np.random.normal(size=10), + } + ) + ddf = dd.from_pandas(df, npartitions=2) + res = getattr(ddf, op)(numeric_only=numeric_only) + # Use to_pandas until cudf supports numeric_only + # (See: https://github.com/rapidsai/cudf/issues/12626) + expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) + dd.assert_eq(res, expect) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 0c352a5068b..f2dd22f43aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &seeds, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except +
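To make the semantics behind the new `word_minhash`/`word_minhash64` declarations concrete: for each seed, the min-hash of a row is the minimum hash value over that row's words, so each row yields one value per seed. A rough pure-Python sketch of the idea follows; `toy_hash` is a hypothetical stand-in for illustration only, whereas libcudf actually uses MurmurHash3_x86_32 for `word_minhash` and MurmurHash3_x64_128 (keeping only the first uint64) for `word_minhash64`:

```python
def word_minhash_row(words, seeds, hash_fn):
    # One output value per seed: the minimum hash over all words in the row.
    return [min(hash_fn(word, seed) for word in words) for seed in seeds]


def toy_hash(word, seed):
    # Hypothetical stand-in hash, NOT the MurmurHash3 variants used by libcudf.
    h = seed & 0xFFFFFFFF
    for byte in word.encode():
        h = (h * 31 + byte) & 0xFFFFFFFF
    return h


# Mirrors the docstring example's input shape: a list column of word rows.
rows = [["this", "is", "my"], ["favorite", "book"]]
for row in rows:
    print(word_minhash_row(row, seeds=[0, 1, 2], hash_fn=toy_hash))
```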