From 30e3946ae79396b7fd09ea368fada0df4babea85 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 19 Sep 2024 01:44:30 -0400
Subject: [PATCH] Whitespace normalization of nested column coerced as string
 column in JSONL inputs (#16759)

Addresses #15280

Whitespace normalization is expected to remove unquoted whitespace characters in JSON lines inputs. However, in the cases where the JSON line is invalid due to an unquoted whitespace occurring in between numbers or literals, the existing normalization implementation is incorrect since it removes these invalidating whitespaces and makes the line valid.

This PR implements the normalization as a post-processing step on only nested columns forced as string columns.
Idea:
1. Create a single buffer by concatenating the rows of the string column. Create segment offset and length arrays for the concatenated buffer.
2. Run a complementary whitespace normalization FST, i.e. one that is a NOP for non-whitespace and quoted whitespace characters and outputs the indices of unquoted whitespace characters.
3. Update segment lengths based on the number of output indices between segment offsets
4. Remove characters at output indices from concatenated buffer.
5. Return updated buffer, segment lengths and updated segment offsets

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16759
---
 cpp/include/cudf/io/detail/json.hpp           |  16 +-
 cpp/src/io/json/json_column.cu                | 149 +++++++++-----
 cpp/src/io/json/json_normalization.cu         | 165 ++++++++++++----
 cpp/src/io/json/nested_json_gpu.cu            |  10 +-
 cpp/src/io/json/read_json.cu                  |   6 -
 cpp/src/io/utilities/parsing_utils.cuh        |   6 +
 cpp/tests/io/json/json_test.cpp               |  43 +++++
 .../json_whitespace_normalization_test.cu     | 182 +++++++++---------
 8 files changed, 388 insertions(+), 189 deletions(-)
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 73ff17b2b93..940d03cdb41 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -69,11 +69,21 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
  * @brief Normalize unquoted whitespace (space and tab characters) using FST
  *
- * @param indata Input device buffer
+ * @param d_input Input device buffer
+ * @param col_offsets Offsets to column contents in input buffer
+ * @param col_lengths Length of contents of each row in column
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
+ *
+ * @returns Tuple of the normalized column, offsets to each row in column, and lengths of contents
+ * of each row
  */
-void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
-                          rmm::cuda_stream_view stream,
-                          rmm::device_async_resource_ref mr);
+std::
+  tuple<rmm::device_uvector<char>, rmm::device_uvector<size_type>, rmm::device_uvector<size_type>>
+  normalize_whitespace(device_span<char const> d_input,
+                       device_span<size_type const> col_offsets,
+                       device_span<size_type const> col_lengths,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
+
 }  // namespace io::json::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 8890c786287..756047d383a 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
+#include <cudf/io/detail/json.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -625,6 +626,8 @@ void make_device_json_column(device_span<SymbolT const> input,
   auto ignore_vals = cudf::detail::make_host_vector<uint8_t>(num_columns, stream);
   std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
   std::vector<uint8_t> is_pruned(num_columns, 0);
+  // for columns that are not mixed type but have been forced as string
+  std::vector<bool> forced_as_string_column(num_columns);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
   std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
@@ -695,11 +698,14 @@ void make_device_json_column(device_span<SymbolT const> input,
     // Struct, List, String, Value
     auto [name, parent_col_id] = name_and_parent_index(this_col_id);
 
-    // if parent is mixed type column or this column is pruned, ignore this column.
+    // Ignore this column if it has a valid parent and the parent is mixed type, this column is
+    // pruned, or the parent is forced as string; all lookups stay behind the sentinel check.
     if (parent_col_id != parent_node_sentinel &&
-        (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) {
+        (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id] ||
+         forced_as_string_column[parent_col_id])) {
       ignore_vals[this_col_id] = 1;
       if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
+      if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; }
       continue;
     }
 
@@ -765,22 +771,26 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
 
     auto this_column_category = column_categories[this_col_id];
-    if (is_enabled_mixed_types_as_string) {
-      // get path of this column, check if it is a struct/list forced as string, and enforce it
-      auto const nt                             = tree_path.get_path(this_col_id);
-      std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
-      if ((column_categories[this_col_id] == NC_STRUCT or
-           column_categories[this_col_id] == NC_LIST) and
-          user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
-        is_mixed_type_column[this_col_id] = 1;
-        this_column_category              = NC_STR;
-      }
+    // get path of this column, check if it is a struct/list forced as string, and enforce it
+    auto const nt                             = tree_path.get_path(this_col_id);
+    std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
+    if ((column_categories[this_col_id] == NC_STRUCT or
+         column_categories[this_col_id] == NC_LIST) and
+        user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
+      this_column_category = NC_STR;
     }
 
     CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
     // move into parent
     device_json_column col(stream, mr);
     initialize_json_columns(this_col_id, col, this_column_category);
+    if ((column_categories[this_col_id] == NC_STRUCT or
+         column_categories[this_col_id] == NC_LIST) and
+        user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
+      col.forced_as_string_column          = true;
+      forced_as_string_column[this_col_id] = true;
+    }
+
     auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second;
     CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent");
     if (not replaced) parent_col.column_order.push_back(name);
@@ -802,12 +812,30 @@ void make_device_json_column(device_span<SymbolT const> input,
           is_mixed_type_column[this_col_id] == 1)
         column_categories[this_col_id] = NC_STR;
     }
-    cudaMemcpyAsync(d_column_tree.node_categories.begin(),
-                    column_categories.data(),
-                    column_categories.size() * sizeof(column_categories[0]),
-                    cudaMemcpyDefault,
-                    stream.value());
+    cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
+                                    column_categories.data(),
+                                    column_categories.size() * sizeof(column_categories[0]),
+                                    cudf::detail::host_memory_kind::PAGEABLE,
+                                    stream);
+  }
+
+  // ignore all children of columns forced as string
+  for (auto const this_col_id : unique_col_ids) {
+    auto parent_col_id = column_parent_ids[this_col_id];
+    if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) {
+      forced_as_string_column[this_col_id] = true;
+      ignore_vals[this_col_id]             = 1;
+    }
+    // Only the top-most forced-as-string column becomes NC_STR (to be copied), not its children
+    if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and
+        forced_as_string_column[this_col_id])
+      column_categories[this_col_id] = NC_STR;
   }
+  cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
+                                  column_categories.data(),
+                                  column_categories.size() * sizeof(column_categories[0]),
+                                  cudf::detail::host_memory_kind::PAGEABLE,
+                                  stream);
 
   // restore unique_col_ids order
   std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
@@ -982,39 +1010,58 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                    "string offset, string length mismatch");
       rmm::device_uvector<char_length_pair_t> d_string_data(col_size, stream);
       // TODO how about directly storing pair<char*, size_t> in json_column?
-      auto offset_length_it =
-        thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin());
 
-      data_type target_type{};
+      auto [result_bitmask, null_count] = make_validity(json_col);
 
-      if (schema.has_value()) {
+      data_type target_type{};
+      std::unique_ptr<column> col{};
+      if (options.normalize_whitespace && json_col.forced_as_string_column) {
+        CUDF_EXPECTS(prune_columns || options.mixed_types_as_string,
+                     "Whitespace normalization of nested columns requested as string requires "
+                     "either prune_columns or mixed_types_as_string to be enabled");
+        auto [normalized_d_input, col_offsets, col_lengths] =
+          cudf::io::json::detail::normalize_whitespace(
+            d_input, json_col.string_offsets, json_col.string_lengths, stream, mr);
+        auto offset_length_it = thrust::make_zip_iterator(col_offsets.begin(), col_lengths.begin());
+        target_type           = data_type{type_id::STRING};
+        // Build the STRING column from the whitespace-normalized rows
+        col = parse_data(normalized_d_input.data(),
+                         offset_length_it,
+                         col_size,
+                         target_type,
+                         std::move(result_bitmask),
+                         null_count,
+                         options.view(),
+                         stream,
+                         mr);
+      } else {
+        auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(),
+                                                          json_col.string_lengths.begin());
+        if (schema.has_value()) {
 #ifdef NJP_DEBUG_PRINT
-        std::cout << "-> explicit type: "
-                  << (schema.has_value() ? std::to_string(static_cast<int>(schema->type.id()))
-                                         : "n/a");
+          std::cout << "-> explicit type: "
+                    << (schema.has_value() ? std::to_string(static_cast<int>(schema->type.id()))
+                                           : "n/a");
 #endif
-        target_type = schema.value().type;
-      } else if (json_col.forced_as_string_column) {
-        target_type = data_type{type_id::STRING};
-      }
-      // Infer column type, if we don't have an explicit type for it
-      else {
-        target_type = cudf::io::detail::infer_data_type(
-          options.json_view(), d_input, offset_length_it, col_size, stream);
+          target_type = schema.value().type;
+        }
+        // Infer column type, if we don't have an explicit type for it
+        else {
+          target_type = cudf::io::detail::infer_data_type(
+            options.json_view(), d_input, offset_length_it, col_size, stream);
+        }
+        // Convert strings to the inferred data type
+        col = parse_data(d_input.data(),
+                         offset_length_it,
+                         col_size,
+                         target_type,
+                         std::move(result_bitmask),
+                         null_count,
+                         options.view(),
+                         stream,
+                         mr);
       }
 
-      auto [result_bitmask, null_count] = make_validity(json_col);
-      // Convert strings to the inferred data type
-      auto col = parse_data(d_input.data(),
-                            offset_length_it,
-                            col_size,
-                            target_type,
-                            std::move(result_bitmask),
-                            null_count,
-                            options.view(),
-                            stream,
-                            mr);
-
       // Reset nullable if we do not have nulls
       // This is to match the existing JSON reader's behaviour:
       // - Non-string columns will always be returned as nullable
@@ -1120,11 +1167,15 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     const auto [tokens_gpu, token_indices_gpu] =
       get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref());
     // gpu tree generation
-    return get_tree_representation(tokens_gpu,
-                                   token_indices_gpu,
-                                   options.is_enabled_mixed_types_as_string(),
-                                   stream,
-                                   cudf::get_current_device_resource_ref());
+    // Note that to normalize whitespaces in nested columns coerced to be string, we need the column
+    // to either be of mixed type or we need to request the column to be returned as string by
+    // pruning it with the STRING dtype
+    return get_tree_representation(
+      tokens_gpu,
+      token_indices_gpu,
+      options.is_enabled_mixed_types_as_string() || options.is_enabled_prune_columns(),
+      stream,
+      cudf::get_current_device_resource_ref());
   }();  // IILE used to free memory of token data.
 #ifdef NJP_DEBUG_PRINT
   auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index 97d5884fef1..2d435dc8e1a 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -17,6 +17,7 @@
 #include "io/fst/lookup_tables.cuh"
 
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -25,8 +26,17 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
-
+#include <rmm/exec_policy.hpp>
+
+#include <cub/device/device_copy.cuh>
+#include <cuda/atomic>
+#include <thrust/binary_search.h>
+#include <thrust/distance.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/remove.h>
 
 #include <cstdlib>
 #include <string>
@@ -215,14 +225,6 @@ std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
  *        |   state is necessary to process escaped double-quote characters. Without this
  *        |   state, whitespaces following escaped double quotes inside strings may be removed.
  *
- * NOTE: An important case NOT handled by this FST is that of whitespace following newline
- * characters within a string. Consider the following example
- * Input:           {"a":"x\n y"}
- * FST output:      {"a":"x\ny"}
- * Expected output: {"a":"x\n y"}
- * Such strings are not part of the JSON standard (characters allowed within quotes should
- * have ASCII at least 0x20 i.e. space character and above) but may be encountered while
- * reading JSON files
  */
 enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
 // Aliases for readability of the transition table
@@ -255,17 +257,17 @@ struct TransduceToNormalizedWS {
     //      Let the alphabet set be Sigma
     // ---------------------------------------
     // ---------- NON-SPECIAL CASES: ----------
-    //      Output symbol same as input symbol <s>
+    //    Input symbol translates to output symbol
     // state | read_symbol <s>  -> output_symbol <s>
-    // DQS   | Sigma            -> Sigma
-    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
-    // DEC   | Sigma            -> Sigma
+    // DQS   | Sigma            -> <nop>
+    // OOS   | Sigma\{<SPC>,\t} -> <nop>
+    // DEC   | Sigma            -> <nop>
     // ---------- SPECIAL CASES: --------------
-    //    Input symbol translates to output symbol
-    // OOS   | {<SPC>}          -> <nop>
-    // OOS   | {\t}             -> <nop>
+    //      Output symbol same as input symbol <s>
+    // OOS   | {<SPC>}          -> {<SPC>}
+    // OOS   | {\t}             -> {\t}
 
-    // Case when read symbol is a space or tab but is unquoted
+    // Case when read symbol is not an unquoted space or tab
     // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
     // However, since there is no output in this case i.e. the count returned by
     // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
@@ -287,8 +289,8 @@ struct TransduceToNormalizedWS {
                                                  SymbolT const read_symbol) const
   {
-    // Case when read symbol is a space or tab but is unquoted
+    // Emit nothing unless the read symbol is an unquoted space or tab
-    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
-        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
+    if (!(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
+          state_id == static_cast<StateT>(dfa_states::TT_OOS))) {
       return 0;
     }
     return 1;
@@ -328,33 +330,126 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
   std::swap(indata, outdata);
 }
 
-void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
-                          rmm::cuda_stream_view stream,
-                          rmm::device_async_resource_ref mr)
+std::
+  tuple<rmm::device_uvector<char>, rmm::device_uvector<size_type>, rmm::device_uvector<size_type>>
+  normalize_whitespace(device_span<char const> d_input,
+                       device_span<size_type const> col_offsets,
+                       device_span<size_type const> col_lengths,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr)
 {
-  CUDF_FUNC_RANGE();
-  static constexpr std::int32_t min_out = 0;
-  static constexpr std::int32_t max_out = 2;
+  /*
+   * Algorithm:
+   * 1. Concatenate the rows of the string column into one contiguous buffer (`inbuf`) and build
+   *    the matching per-row segment offsets and lengths.
+   * 2. Run the whitespace normalization FST, which is a NOP for non-whitespace and quoted
+   *    whitespace characters and outputs the indices of unquoted whitespace characters.
+   * 3. Decrement each segment length by the number of output indices that fall inside it.
+   * 4. Remove the characters at the output indices from the concatenated buffer.
+   * 5. Recompute the segment offsets and return the buffer, offsets, and lengths.
+   */
+  auto inbuf_lengths = cudf::detail::make_device_uvector_async(
+    col_lengths, stream, cudf::get_current_device_resource_ref());
+  size_t inbuf_lengths_size = inbuf_lengths.size();
+  size_type inbuf_size =
+    thrust::reduce(rmm::exec_policy_nosync(stream), inbuf_lengths.begin(), inbuf_lengths.end());
+  rmm::device_uvector<char> inbuf(inbuf_size, stream);
+  rmm::device_uvector<size_type> inbuf_offsets(inbuf_lengths_size, stream);
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                         inbuf_lengths.begin(),
+                         inbuf_lengths.end(),
+                         inbuf_offsets.begin(),
+                         0);
+
+  auto input_it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    cuda::proclaim_return_type<char const*>(
+      [d_input = d_input.begin(), col_offsets = col_offsets.begin()] __device__(
+        size_t i) -> char const* { return &d_input[col_offsets[i]]; }));
+  auto output_it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    cuda::proclaim_return_type<char*>(
+      [inbuf = inbuf.begin(), inbuf_offsets = inbuf_offsets.cbegin()] __device__(
+        size_t i) -> char* { return &inbuf[inbuf_offsets[i]]; }));
+
+  {
+    // Gather every row into inbuf with cub::DeviceCopy::Batched (size query, then the copy)
+    size_t temp_storage_bytes = 0;
+    cub::DeviceCopy::Batched(nullptr,
+                             temp_storage_bytes,
+                             input_it,
+                             output_it,
+                             inbuf_lengths.begin(),
+                             inbuf_lengths_size,
+                             stream.value());
+    rmm::device_buffer temp_storage(temp_storage_bytes, stream);
+    cub::DeviceCopy::Batched(temp_storage.data(),
+                             temp_storage_bytes,
+                             input_it,
+                             output_it,
+                             inbuf_lengths.begin(),
+                             inbuf_lengths_size,
+                             stream.value());
+  }
+
+  // Run the FST to collect the indices of the unquoted whitespace characters in inbuf
   auto parser =
     fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
                           fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
-                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+                          fst::detail::make_translation_functor<SymbolT, 0, 2>(
                             normalize_whitespace::TransduceToNormalizedWS{}),
                           stream);
 
-  rmm::device_buffer outbuf(indata.size(), stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
-                   static_cast<SymbolOffsetT>(indata.size()),
-                   static_cast<SymbolT*>(outbuf.data()),
+  rmm::device_uvector<size_type> outbuf_indices(inbuf.size(), stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_indices_size(stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
                    thrust::make_discard_iterator(),
-                   outbuf_size.data(),
+                   outbuf_indices.data(),
+                   outbuf_indices_size.data(),
                    normalize_whitespace::start_state,
                    stream);
 
-  outbuf.resize(outbuf_size.value(stream), stream);
-  datasource::owning_buffer<rmm::device_buffer> outdata(std::move(outbuf));
-  std::swap(indata, outdata);
+  auto const num_deletions = outbuf_indices_size.value(stream);
+  outbuf_indices.resize(num_deletions, stream);
+
+  // Each index to delete shrinks its owning row: locate the row by binary search over the
+  // segment offsets and atomically decrement its length. TODO: is there a better way to do this?
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    outbuf_indices.begin(),
+    outbuf_indices.end(),
+    [inbuf_offsets_begin = inbuf_offsets.begin(),
+     inbuf_offsets_end   = inbuf_offsets.end(),
+     inbuf_lengths       = inbuf_lengths.begin()] __device__(size_type idx) {
+      auto it  = thrust::upper_bound(thrust::seq, inbuf_offsets_begin, inbuf_offsets_end, idx);
+      auto pos = thrust::distance(inbuf_offsets_begin, it) - 1;
+      cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(inbuf_lengths + pos)};
+      ref.fetch_add(-1, cuda::std::memory_order_relaxed);
+    });
+
+  auto stencil = cudf::detail::make_zeroed_device_uvector_async<bool>(
+    static_cast<std::size_t>(inbuf_size), stream, cudf::get_current_device_resource_ref());
+  thrust::scatter(rmm::exec_policy_nosync(stream),
+                  thrust::make_constant_iterator(true),
+                  thrust::make_constant_iterator(true) + num_deletions,
+                  outbuf_indices.begin(),
+                  stencil.begin());
+  thrust::remove_if(rmm::exec_policy_nosync(stream),
+                    inbuf.begin(),
+                    inbuf.end(),
+                    stencil.begin(),
+                    thrust::identity<int>());
+  inbuf.resize(inbuf_size - num_deletions, stream);
+
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                         inbuf_lengths.begin(),
+                         inbuf_lengths.end(),
+                         inbuf_offsets.begin(),
+                         0);
+
+  stream.synchronize();
+  return std::tuple{std::move(inbuf), std::move(inbuf_offsets), std::move(inbuf_lengths)};
 }
 
 }  // namespace detail
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 4e513d3495c..1c15e147b13 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -2079,10 +2079,12 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
 {
   auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'};
 
-  parse_opts.dayfirst   = options.is_enabled_dayfirst();
-  parse_opts.keepquotes = options.is_enabled_keep_quotes();
-  parse_opts.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
-  parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
+  parse_opts.dayfirst              = options.is_enabled_dayfirst();
+  parse_opts.keepquotes            = options.is_enabled_keep_quotes();
+  parse_opts.normalize_whitespace  = options.is_enabled_normalize_whitespace();
+  parse_opts.mixed_types_as_string = options.is_enabled_mixed_types_as_string();
+  parse_opts.trie_true             = cudf::detail::create_serialized_trie({"true"}, stream);
+  parse_opts.trie_false            = cudf::detail::create_serialized_trie({"false"}, stream);
   std::vector<std::string> na_values{"", "null"};
   na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
   parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index bd82b040359..99a5b17bce8 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -232,12 +232,6 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
     normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
   }
 
-  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
-  // enabled, invoke pre-processing FST
-  if (reader_opts.is_enabled_normalize_whitespace()) {
-    normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref());
-  }
-
   auto buffer =
     cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
   stream.synchronize();
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index bc2722441d0..734067582f7 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -67,6 +67,8 @@ struct parse_options_view {
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
+  bool normalize_whitespace;
+  bool mixed_types_as_string;
   cudf::detail::trie_view trie_true;
   cudf::detail::trie_view trie_false;
   cudf::detail::trie_view trie_na;
@@ -85,6 +87,8 @@ struct parse_options {
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
+  bool normalize_whitespace;
+  bool mixed_types_as_string;
   cudf::detail::optional_trie trie_true;
   cudf::detail::optional_trie trie_false;
   cudf::detail::optional_trie trie_na;
@@ -111,6 +115,8 @@ struct parse_options {
             doublequote,
             dayfirst,
             skipblanklines,
+            normalize_whitespace,
+            mixed_types_as_string,
             cudf::detail::make_trie_view(trie_true),
             cudf::detail::make_trie_view(trie_false),
             cudf::detail::make_trie_view(trie_na),
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 960c19fce2e..48bc982d0e3 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2856,4 +2856,47 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren)
   }
 }
 
+TEST_F(JsonReaderTest, JsonDtypeSchema)
+{
+  std::string data = R"(
+    {"a": 1, "b": {"0": "abc", "1": ["a", "b"]}, "c": true}
+    {"a": 1, "b": {"0": "abc"          }, "c": false}
+    {"a": 1, "b": {"0": "lolol  "}, "c": true}
+    )";
+
+  std::map<std::string, cudf::io::schema_element> dtype_schema{{"c", {data_type{type_id::STRING}}},
+                                                               {"b", {data_type{type_id::STRING}}},
+                                                               {"a", {dtype<double>()}}};
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(dtype_schema)
+      .prune_columns(true)
+      .lines(true);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 3);
+  EXPECT_EQ(result.tbl->num_rows(), 3);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT64);
+  EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING);
+  EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+  EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+  EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+
+  // "b" is a struct forced to STRING, so every row must keep its raw text, whitespace included
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
+    result.tbl->get_column(1),
+    cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}",
+                                        "{\"0\": \"abc\"          }",
+                                        "{\"0\": \"lolol  \"}"}),
+    cudf::test::debug_output_level::ALL_ERRORS);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2),
+                                 cudf::test::strings_column_wrapper({"true", "false", "true"}),
+                                 cudf::test::debug_output_level::ALL_ERRORS);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu
index 6d79fdc98ef..6a3bd69de81 100644
--- a/cpp/tests/io/json/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu
@@ -34,129 +34,127 @@
 // Base test fixture for tests
 struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(std::string const& host_input, std::string const& expected_host_output)
-{
-  // Prepare cuda stream for data transfers & kernels
-  auto stream_view = cudf::test::get_default_stream();
-
-  auto device_input = rmm::device_buffer(
-    host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref());
-
-  // Preprocessing FST
-  cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
-  cudf::io::json::detail::normalize_whitespace(
-    device_data, stream_view, cudf::get_current_device_resource_ref());
-
-  std::string preprocessed_host_output(device_data.size(), 0);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
-                                device_data.data(),
-                                preprocessed_host_output.size(),
-                                cudaMemcpyDeviceToHost,
-                                stream_view.value()));
-
-  stream_view.synchronize();
-  ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size());
-  CUDF_TEST_EXPECT_VECTOR_EQUAL(
-    preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
-}
-
-TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces)
+TEST_F(JsonWSNormalizationTest, ReadJsonOption)
 {
-  std::string input  = R"({ "A" : "TEST" })";
-  std::string output = R"({"A":"TEST"})";
-  run_test(input, output);
-}
+  // When mixed type fields are read as strings, the table read will differ depending on the
+  // value of normalize_whitespace.
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces)
-{
-  std::string input  = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})";
-  std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})";
-  run_test(input, output);
-}
+  // Test input
+  std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(true);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesInString)
-{
-  std::string input  = R"({" a ":50})";
-  std::string output = R"({" a ":50})";
-  run_test(input, output);
-}
+  cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_NewlineInString)
-{
-  std::string input  = "{\"a\" : \"x\ny\"}\n{\"a\" : \"x\\ny\"}";
-  std::string output = "{\"a\":\"x\ny\"}\n{\"a\":\"x\\ny\"}";
-  run_test(input, output);
-}
+  // Expected table
+  std::string const expected_input = R"({ "a" : {"b":"c"}})";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(false);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs)
-{
-  std::string input  = "{\"a\":\t\"b\"}";
-  std::string output = R"({"a":"b"})";
-  run_test(input, output);
+  cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesAndTabs)
+TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows)
 {
-  std::string input  = "{\"A\" : \t\"TEST\" }";
-  std::string output = R"({"A":"TEST"})";
-  run_test(input, output);
-}
+  // When mixed type fields are read as strings, the table read will differ depending on the
+  // value of normalize_whitespace.
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs)
-{
-  std::string input =
-    "{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, "
-    "\"bar\trapids\": 456 }";
-  std::string output =
-    "{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}";
-  run_test(input, output);
-}
+  // Test input
+  std::string const host_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } ] } }
+  { "Root": { "Key": "abc" } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{ "YY": 1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [ { "EE": "efg" } ] } }
+  )";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample)
-{
-  std::string input  = R"([{"a":50}, {"a" : 60}])";
-  std::string output = R"([{"a":50},{"a":60}])";
-  run_test(input, output);
-}
+  cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired)
-{
-  std::string input  = R"({"a\\n\r\a":50})";
-  std::string output = R"({"a\\n\r\a":50})";
-  run_test(input, output);
-}
+  // Expected table
+  std::string const expected_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } ] } }
+  { "Root": { "Key": "abc" } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{"YY":1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [{"EE":"efg"}] } }
+  )";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(false)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput)
-{
-  std::string input  = "{\"a\" : \"b }\n{ \"c \" :\t\"d\"}";
-  std::string output = "{\"a\":\"b }\n{\"c \":\"d\"}";
-  run_test(input, output);
+  cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 
-TEST_F(JsonWSNormalizationTest, ReadJsonOption)
+TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows_NoMixedType)
 {
   // When mixed type fields are read as strings, the table read will differ depending the
   // value of normalize_whitespace
 
   // Test input
-  std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}";
+  std::string const host_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } ] } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{ "YY": 1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [ { "EE": "efg" }, { "YY" :   "abc" }    ] } }
+  { "Root": { "Key": [  { "YY" :   "abc" }    ] } }
+  )";
+
+  std::map<std::string, cudf::io::schema_element> dtype_schema{
+    {"Key", {cudf::data_type{cudf::type_id::STRING}}}};
+
   cudf::io::json_reader_options input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{host_input.data(), host_input.size()})
+      .dtypes(dtype_schema)
       .lines(true)
-      .mixed_types_as_string(true)
-      .normalize_whitespace(true);
+      .prune_columns(true)
+      .normalize_whitespace(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
   cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
 
   // Expected table
-  std::string const expected_input = R"({ "a" : {"b":"c"}})";
+  std::string const expected_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } , { "YY" :    2 } ] } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{"YY":1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [{"EE":"efg"},{"YY":"abc"}] } }
+  { "Root": { "Key": [{"YY":"abc"}] } }
+  )";
+
   cudf::io::json_reader_options expected_input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .dtypes(dtype_schema)
       .lines(true)
-      .mixed_types_as_string(true)
-      .normalize_whitespace(false);
+      .prune_columns(true)
+      .normalize_whitespace(false)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
   cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());