From 36f56c97b94446f29fef5d2ddd8818275a28e406 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 20 Dec 2023 20:16:53 -0800 Subject: [PATCH] Add row conversion code from spark-rapids-jni (#14664) This temporarily moves the row conversion code from spark-rapids-jni into libcudf. It is necessary to have the row conversion code compiled in a static library to overcome a CCCL issue that triggers invalid memory access when calling to `thrust::in(ex)clusive_scan` (https://github.com/NVIDIA/spark-rapids-jni/issues/1567). In the future, when we have CCCL updated to fix the issue (1567), we may need to move the code back into spark-rapids-jni. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vyas Ramasubramani (https://github.com/vyasr) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14664 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/row_conversion.hpp | 54 + .../src/transform}/row_conversion.cu | 1577 ++++++++++------- cpp/tests/CMakeLists.txt | 2 + cpp/tests/transform/row_conversion.cpp | 1021 +++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 142 -- java/src/main/native/CMakeLists.txt | 1 - java/src/main/native/src/TableJni.cpp | 85 - java/src/main/native/src/row_conversion.hpp | 52 - .../test/java/ai/rapids/cudf/TableTest.java | 67 - 10 files changed, 1990 insertions(+), 1012 deletions(-) create mode 100644 cpp/include/cudf/row_conversion.hpp rename {java/src/main/native/src => cpp/src/transform}/row_conversion.cu (58%) create mode 100644 cpp/tests/transform/row_conversion.cpp delete mode 100644 java/src/main/native/src/row_conversion.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4163275744e..a7c34ca489c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -628,6 +628,7 @@ add_library( src/transform/nans_to_nulls.cu src/transform/one_hot_encode.cu src/transform/row_bit_count.cu + src/transform/row_conversion.cu src/transform/transform.cpp src/transpose/transpose.cu src/unary/cast_ops.cu diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp new file mode 100644 index 00000000000..89453d49856 --- /dev/null +++ b/cpp/include/cudf/row_conversion.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cudf { +//! 
@cond Doxygen_Suppress + +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const& tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::vector> convert_to_rows( + cudf::table_view const& tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +//! @endcond +} // namespace cudf diff --git a/java/src/main/native/src/row_conversion.cu b/cpp/src/transform/row_conversion.cu similarity index 58% rename from java/src/main/native/src/row_conversion.cu rename to cpp/src/transform/row_conversion.cu index fd7e7bc0b31..f3af90fb54f 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -15,14 +15,15 @@ */ #include -#include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -42,15 +43,13 @@ #include #include -#include "row_conversion.hpp" - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 #define ASYNC_MEMCPY_SUPPORTED #endif #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) +#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include #include @@ -61,6 +60,8 @@ #include #include +#include + namespace { constexpr auto JCUDF_ROW_ALIGNMENT = 8; @@ -68,30 +69,31 @@ constexpr auto JCUDF_ROW_ALIGNMENT = 8; constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); // Number of rows each block processes in the two kernels. Tuned via nsight -constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; +constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; constexpr auto NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS = 64; -constexpr auto MIN_STRING_BLOCKS = 32; -constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; +constexpr auto MIN_STRING_BLOCKS = 32; +constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; constexpr auto NUM_WARPS_IN_BLOCK = 32; -} // anonymous namespace +} // anonymous namespace // needed to suppress warning about cuda::barrier #pragma nv_diag_suppress static_var_with_dynamic_init using namespace cudf; using detail::make_device_uvector_async; +using detail::make_device_uvector_sync; using rmm::device_uvector; #ifdef ASYNC_MEMCPY_SUPPORTED using cuda::aligned_size_t; #else -template using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED +template +using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. 
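// Illustrative sketch (not part of the patch): how the functions declared in
// cudf/row_conversion.hpp above might be used for a round trip. `round_trip`
// is a hypothetical helper; it assumes `tbl` holds only fixed-width and string
// columns, matching the CUDF_EXPECTS checks in this file.
#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/row_conversion.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <memory>
#include <vector>

std::unique_ptr<cudf::table> round_trip(cudf::table_view const& tbl)
{
  // Each returned column is a LIST<INT8> column holding one batch of JCUDF rows.
  auto row_batches = cudf::convert_to_rows(tbl);

  // The schema is required to rebuild typed columns from the opaque row bytes.
  std::vector<cudf::data_type> schema;
  for (auto const& col : tbl) {
    schema.push_back(col.type());
  }

  // Convert the first batch back into a cudf table.
  return cudf::convert_from_rows(cudf::lists_column_view{row_batches.front()->view()}, schema);
}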
+#endif // ASYNC_MEMCPY_SUPPORTED namespace cudf { -namespace jni { namespace detail { /* @@ -157,8 +159,9 @@ struct tile_info { int end_row; int batch_number; - __device__ inline size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __device__ inline size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes) const + { // this calculation is invalid if there are holes in the data such as a variable-width column. // It is wrong in a safe way in that it will say this row size is larger than it should be, so // we are not losing data we are just not as efficient as we could be with shared memory. This @@ -180,9 +183,9 @@ struct tile_info { * */ struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column }; /** @@ -190,11 +193,11 @@ struct row_batch { * */ struct batch_data { - device_uvector batch_row_offsets; // offsets to each row in incoming data - device_uvector d_batch_row_boundaries; // row numbers for the start of each batch + device_uvector batch_row_offsets; // offsets to each row in incoming data + device_uvector d_batch_row_boundaries; // row numbers for the start of each batch std::vector - batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 - std::vector row_batches; // information about each batch such as byte count + batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 + std::vector row_batches; // information about each batch such as byte count }; /** @@ -207,8 +210,10 @@ struct batch_data { * offsets into the string column */ std::pair, rmm::device_uvector> -build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validity_size, - rmm::cuda_stream_view stream) { +build_string_row_offsets(table_view const& tbl, + size_type fixed_width_and_validity_size, + rmm::cuda_stream_view stream) +{ auto const num_rows = tbl.num_rows(); rmm::device_uvector d_row_sizes(num_rows, stream); thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); @@ -216,41 +221,48 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi auto d_offsets_iterators = [&]() { std::vector offsets_iterators; auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const &col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } - }); - std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), + tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { + if (!is_fixed_width(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); + return strings_column_view(col).offsets_begin(); + } else { + return nullptr; + } + }); + std::copy_if(offsets_iter, + offsets_iter + tbl.num_columns(), std::back_inserter(offsets_iterators), - [](auto const &offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_async(offsets_iterators, stream, - rmm::mr::get_current_device_resource()); 
+ [](auto const& offset_ptr) { return offset_ptr != nullptr; }); + return make_device_uvector_sync( + offsets_iterators, stream, rmm::mr::get_current_device_resource()); }(); auto const num_columns = static_cast(d_offsets_iterators.size()); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_columns * num_rows), - [d_offsets_iterators = d_offsets_iterators.data(), num_columns, num_rows, + [d_offsets_iterators = d_offsets_iterators.data(), + num_columns, + num_rows, d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { auto const row = element_idx % num_rows; auto const col = element_idx / num_rows; auto const val = - d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; + d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; atomicAdd(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment - thrust::transform( - rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), d_row_sizes.begin(), - cuda::proclaim_return_type([fixed_width_and_validity_size] __device__( - auto row_size) { - return util::round_up_unsafe(fixed_width_and_validity_size + row_size, JCUDF_ROW_ALIGNMENT); - })); + thrust::transform(rmm::exec_policy(stream), + d_row_sizes.begin(), + d_row_sizes.end(), + d_row_sizes.begin(), + cuda::proclaim_return_type( + [fixed_width_and_validity_size] __device__(auto row_size) { + return util::round_up_unsafe(fixed_width_and_validity_size + row_size, + JCUDF_ROW_ALIGNMENT); + })); return {std::move(d_row_sizes), std::move(d_offsets_iterators)}; } @@ -261,9 +273,10 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi */ struct string_row_offset_functor { string_row_offset_functor(device_span d_row_offsets) - : d_row_offsets(d_row_offsets){}; + : d_row_offsets(d_row_offsets){}; - __device__ inline size_type operator()(int row_number, int) const { + __device__ inline size_type operator()(int row_number, int) const + { return d_row_offsets[row_number]; } @@ -276,9 +289,10 @@ struct string_row_offset_functor { */ struct fixed_width_row_offset_functor { fixed_width_row_offset_functor(size_type fixed_width_only_row_size) - : _fixed_width_only_row_size(fixed_width_only_row_size){}; + : _fixed_width_only_row_size(fixed_width_only_row_size){}; - __device__ inline size_type operator()(int row_number, int tile_row_start) const { + __device__ inline size_type operator()(int row_number, int tile_row_start) const + { return (row_number - tile_row_start) * _fixed_width_only_row_size; } @@ -300,11 +314,15 @@ struct fixed_width_row_offset_functor { * @param output_nm array of pointers to the output null masks * @param input_data pointing to the incoming row data */ -__global__ void -copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *input_offset_in_row, - const size_type *num_bytes, int8_t **output_data, - bitmask_type **output_nm, const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* input_offset_in_row, + const size_type* num_bytes, + int8_t** output_data, + bitmask_type** output_nm, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. 
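// Illustrative sketch (not part of the patch) of the per-row size arithmetic
// performed by build_string_row_offsets above: a row is the fixed-width and
// validity bytes plus that row's string bytes, rounded up to the 8-byte
// JCUDF_ROW_ALIGNMENT. `jcudf_row_size` is a hypothetical helper name.
#include <cstdint>

constexpr int32_t round_up(int32_t value, int32_t alignment)
{
  return ((value + alignment - 1) / alignment) * alignment;
}

constexpr int32_t jcudf_row_size(int32_t fixed_width_and_validity_size, int32_t string_bytes)
{
  return round_up(fixed_width_and_validity_size + string_bytes, 8 /* JCUDF_ROW_ALIGNMENT */);
}

// 21 fixed-width/validity bytes plus 11 string bytes pads out to 32 bytes.
static_assert(jcudf_row_size(21, 11) == 32);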
// The second pass copies that chunk from shared memory out to the final location. @@ -319,30 +337,30 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // are controlled by the x dimension (there are multiple blocks in the x // dimension). - size_type const rows_per_group = blockDim.x; - size_type const row_group_start = blockIdx.x; + size_type const rows_per_group = blockDim.x; + size_type const row_group_start = blockIdx.x; size_type const row_group_stride = gridDim.x; - size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (auto row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t const *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t const* long_input = reinterpret_cast(input_data); - auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); auto const shared_output_stride = blockDim.x * blockDim.y; - auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); - auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - auto const shared_length = row_size * num_rows_in_group; + auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); + auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + auto const shared_length = row_size * num_rows_in_group; size_type const shared_output_end = shared_length / sizeof(int64_t); @@ -363,33 +381,33 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data in for the next row group. 
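// Standalone sketch (not part of the patch) of the step-1 staging pattern
// above: every thread in the block copies 8-byte words of the row-major input
// into shared memory, striding by the full thread count of the block.
// `stage_rows_as_int64` is a hypothetical helper and must be called by every
// thread of the block.
#include <cstdint>

__device__ void stage_rows_as_int64(int64_t const* long_input,
                                    int64_t* long_shared,
                                    int shared_word_count,
                                    int input_word_offset)
{
  auto const tid    = threadIdx.x + threadIdx.y * blockDim.x;
  auto const stride = blockDim.x * blockDim.y;
  for (int i = tid; i < shared_word_count; i += stride) {
    long_shared[i] = long_input[input_word_offset + i];
  }
  __syncthreads();  // the whole row group must be staged before any thread reads it
}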
- uint32_t active_mask = __ballot_sync(0xffff'ffffu, row_index < num_rows); + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - auto const col_index_start = threadIdx.y; + auto const col_index_start = threadIdx.y; auto const col_index_stride = blockDim.y; for (auto col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - auto const col_size = num_bytes[col_index]; - int8_t const *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + auto const col_size = num_bytes[col_index]; + int8_t const* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -402,25 +420,29 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n } } - bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy // wait for the row_group to be totally copied before starting on the next row group __syncthreads(); } } -__global__ void copy_to_rows_fixed_width_optimized( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *output_offset_in_row, const size_type *num_bytes, - const int8_t **input_data, const bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* output_offset_in_row, + const size_type* num_bytes, + const int8_t** input_data, + const bitmask_type** input_nm, + int8_t* output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -437,18 +459,18 @@ __global__ void copy_to_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). 
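// Standalone sketch (not part of the patch) of the __ballot_sync pattern used
// above to build null-mask words: each thread contributes one validity bit for
// its row, and the thread owning a word-aligned row writes the packed word.
// Assumes rows map contiguously to threads and blockDim.x is a multiple of 32.
// `pack_validity` is a hypothetical kernel.
#include <cstdint>

__global__ void pack_validity(uint8_t const* byte_flags, uint32_t* null_mask_words, int num_rows)
{
  int const row         = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t const active = __ballot_sync(0xffffffffu, row < num_rows);
  if (row < num_rows) {
    uint32_t const word = __ballot_sync(active, byte_flags[row] != 0);
    if (row % 32 == 0) { null_mask_words[row / 32] = word; }
  }
}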
- size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { @@ -459,31 +481,31 @@ __global__ void copy_to_rows_fixed_width_optimized( // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data back out. if (row_index < (start_row + num_rows)) { - size_type col_index_start = threadIdx.y; + size_type col_index_start = threadIdx.y; size_type col_index_stride = blockDim.y; for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + size_type col_size = num_bytes[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -497,11 +519,11 @@ __global__ void copy_to_rows_fixed_width_optimized( } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); + size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { if (bit_is_set(input_nm[col_index], row_index)) { @@ -513,24 +535,22 
@@ __global__ void copy_to_rows_fixed_width_optimized( // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end column loop - } // end row copy + } // end column loop + } // end row copy // wait for the row_group to be totally copied into shared memory __syncthreads(); // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); - size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); size_type shared_input_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } + size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; + size_type shared_length = row_size * num_rows_in_group; size_type shared_input_end = shared_length / sizeof(int64_t); @@ -549,7 +569,7 @@ __global__ void copy_to_rows_fixed_width_optimized( #define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) #else #define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED /** * @brief copy data from cudf columns into JCUDF format, which is row-based @@ -568,12 +588,17 @@ __global__ void copy_to_rows_fixed_width_optimized( * */ template -__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, +__global__ void copy_to_rows(const size_type num_rows, + const size_type num_columns, const size_type shmem_used_per_tile, - device_span tile_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - RowOffsetFunctor row_offsets, size_type const *batch_row_boundaries, - int8_t **output_data) { + device_span tile_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -583,21 +608,19 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // any calculation to do here, but it is important to note. 
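// Standalone sketch (not part of the patch) of the cuda::barrier +
// cuda::memcpy_async staging pattern that the tiled kernels below build on.
// It requires the ASYNC_MEMCPY_SUPPORTED path (compute capability 7.0+) and a
// launch with `bytes` of dynamic shared memory. `staged_copy` is a
// hypothetical kernel.
#include <cooperative_groups.h>
#include <cuda/barrier>

#include <cstddef>
#include <cstdint>

__global__ void staged_copy(int8_t const* src, int8_t* dst, std::size_t bytes)
{
  extern __shared__ int8_t staging[];
  auto const block = cooperative_groups::this_thread_block();

  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
  if (block.thread_rank() == 0) { init(&barrier, block.size()); }
  block.sync();

  // All threads of the block cooperate in the asynchronous copy into shared
  // memory, then wait on the barrier before touching the staged bytes.
  cuda::memcpy_async(block, staging, src, bytes, barrier);
  barrier.arrive_and_wait();

  // ... transform `staging` here ...

  cuda::memcpy_async(block, dst, staging, bytes, barrier);
  barrier.arrive_and_wait();
}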
auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared_data[]; #ifdef ASYNC_MEMCPY_SUPPORTED __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); + auto const tile = tile_infos[blockIdx.x]; + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[tile.start_col]; // to do the copy we need to do n column copies followed by m element copies OR we have to do m @@ -612,12 +635,11 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // works on a row for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { - - auto const absolute_col = relative_col + tile.start_col; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; + auto const absolute_col = relative_col + tile.start_col; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; auto const relative_col_offset = col_offset - starting_column_offset; - auto const col_ptr = input_data[absolute_col]; + auto const col_ptr = input_data[absolute_col]; if (col_ptr == nullptr) { // variable-width data column @@ -626,7 +648,6 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum for (int relative_row = warp.thread_rank(); relative_row < num_tile_rows; relative_row += warp.size()) { - if (relative_row >= num_tile_rows) { // out of bounds continue; @@ -634,23 +655,23 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto const absolute_row = relative_row + tile.start_row; auto const shared_offset = relative_row * tile_row_size + relative_col_offset; - auto const input_src = col_ptr + col_size * absolute_row; + auto const input_src = col_ptr + col_size * absolute_row; // copy the element from global memory switch (col_size) { case 2: { - const int16_t *short_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; + const int16_t* short_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; + const int32_t* int_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; + const int64_t* long_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; break; } case 1: shared_data[shared_offset] = *input_src; break; @@ -691,7 +712,7 @@ __global__ void copy_to_rows(const 
size_type num_rows, const size_type num_colum tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -710,58 +731,60 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * */ template -__global__ void -copy_validity_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type validity_offset, device_span tile_infos, - const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type validity_offset, + device_span tile_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; // each thread of warp reads a single int32 of validity - so we read 128 bytes then ballot_sync // the bits and write the result to shmem after we fill shared mem memcpy it out in a blob. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto tile = tile_infos[blockIdx.x]; + auto tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const rows_per_read = cudf::detail::size_in_bits(); + auto const rows_per_read = cudf::detail::size_in_bits(); auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, threads_per_warp); auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, rows_per_read); auto const validity_data_row_length = util::round_up_unsafe( - util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. 
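// Standalone sketch (not part of the patch) of the warp-level bit transpose
// performed below: each lane holds one column's 32-bit validity word (32 rows),
// and 32 rounds of __ballot_sync emit 32 row-major words, one validity bit per
// column. Assumes a full, converged warp in a 1-D block; the helper name is
// hypothetical.
#include <cstdint>

__device__ void transpose_validity_warp(uint32_t my_column_word, uint32_t* row_major_out)
{
  for (int row_bit = 0; row_bit < 32; ++row_bit) {
    uint32_t const row_word = __ballot_sync(0xffffffffu, my_column_word & (1u << row_bit));
    if (threadIdx.x % 32 == 0) { row_major_out[row_bit] = row_word; }
  }
}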
for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); - auto const relative_row = section_y * rows_per_read; - auto const absolute_col = relative_col + tile.start_col; - auto const absolute_row = relative_row + tile.start_row; - auto const participating = absolute_col < num_columns && absolute_row < num_rows; - auto const participation_mask = __ballot_sync(0xFFFF'FFFFu, participating); + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); + auto const relative_row = section_y * rows_per_read; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; + auto const participating = absolute_col < num_columns && absolute_row < num_rows; + auto const participation_mask = __ballot_sync(0xFFFFFFFF, participating); if (participating) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][word_index(absolute_row)] : - std::numeric_limits::max(); + auto my_data = input_nm[absolute_col] != nullptr + ? input_nm[absolute_col][word_index(absolute_row)] + : std::numeric_limits::max(); // every thread that is participating in the warp has 4 bytes, but it's column-based data and // we need it in row-based. So we shuffle the bits around with ballot_sync to make the bytes @@ -771,19 +794,19 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); + validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); if (warp.thread_rank() == 0) { - *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; + *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; } } } } auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; + output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; // each warp copies a row at a time - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); auto const row_batch_start = tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; // make sure entire tile has finished copy @@ -809,7 +832,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -828,42 +851,46 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, * */ template -__global__ void copy_strings_to_rows(size_type const num_rows, size_type const num_variable_columns, - int8_t const **variable_input_data, - size_type const *variable_col_output_offsets, - size_type const **variable_col_offsets, - size_type fixed_width_row_size, RowOffsetFunctor row_offsets, - size_type const batch_row_offset, int8_t *output_data) { +__global__ void copy_strings_to_rows(size_type const num_rows, + size_type const num_variable_columns, + int8_t const** variable_input_data, + size_type const* variable_col_output_offsets, + size_type const** variable_col_offsets, + size_type fixed_width_row_size, + RowOffsetFunctor row_offsets, + size_type const batch_row_offset, + int8_t* output_data) +{ // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp // will copy a row at a time. The base thread will first go through column data and fill out // offset/length information for the column. Then all threads of the warp will participate in the // memcpy of the string data. auto const my_block = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(my_block); + auto const warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif auto const start_row = - blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; + blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; auto const end_row = - std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); + std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); for (int row = start_row; row < end_row; row += warp.meta_group_size()) { - auto offset = fixed_width_row_size; // initial offset to variable-width data + auto offset = fixed_width_row_size; // initial offset to variable-width data auto const base_row_offset = row_offsets(row, 0); for (int col = 0; col < num_variable_columns; ++col) { auto const string_start_offset = variable_col_offsets[col][row]; - auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; + auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; if (warp.thread_rank() == 0) { // write the offset/length to column - uint32_t *output_dest = reinterpret_cast( - &output_data[base_row_offset + variable_col_output_offsets[col]]); + uint32_t* output_dest = reinterpret_cast( + &output_data[base_row_offset + variable_col_output_offsets[col]]); output_dest[0] = offset; output_dest[1] = string_length; } auto string_output_dest = &output_data[base_row_offset + offset]; - auto string_output_src = &variable_input_data[col][string_start_offset]; + auto string_output_src = &variable_input_data[col][string_start_offset]; warp.sync(); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier); @@ -893,11 +920,17 @@ __global__ void copy_strings_to_rows(size_type const num_rows, size_type const n * */ template -__global__ void 
copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type *col_sizes, const size_type *col_offsets, - device_span tile_infos, const int8_t *input_data) { +__global__ void copy_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type* col_sizes, + const size_type* col_offsets, + device_span tile_infos, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -910,31 +943,30 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // memory for each of the tiles that we work on auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared[]; #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED { - auto const fetch_tile = tile_infos[blockIdx.x]; + auto const fetch_tile = tile_infos[blockIdx.x]; auto const fetch_tile_start_row = fetch_tile.start_row; - auto const starting_col_offset = col_offsets[fetch_tile.start_col]; - auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto const starting_col_offset = col_offsets[fetch_tile.start_col]; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); auto const row_batch_start = - fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; + fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; for (int absolute_row = warp.meta_group_rank() + fetch_tile.start_row; - absolute_row <= fetch_tile.end_row; absolute_row += warp.meta_group_size()) { + absolute_row <= fetch_tile.end_row; + absolute_row += warp.meta_group_size()) { warp.sync(); auto shared_offset = (absolute_row - fetch_tile_start_row) * fetch_tile_row_size; - auto dst = &shared[shared_offset]; + auto dst = &shared[shared_offset]; auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; // copy the data #ifdef ASYNC_MEMCPY_SUPPORTED @@ -948,9 +980,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col } { - auto const tile = tile_infos[blockIdx.x]; - auto const rows_in_tile = tile.num_rows(); - auto const cols_in_tile = tile.num_cols(); + auto const tile = tile_infos[blockIdx.x]; + auto const rows_in_tile = tile.num_rows(); + auto const cols_in_tile = tile.num_cols(); auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); #ifdef ASYNC_MEMCPY_SUPPORTED @@ -958,7 +990,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED // Now we copy from shared memory to final destination. 
The data is laid out in rows in shared // memory, so the reads for a column will be "vertical". Because of this and the different sizes @@ -967,8 +999,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // than rows, we do a global index instead of a double for loop with col/row. for (int relative_row = warp.thread_rank(); relative_row < rows_in_tile; relative_row += warp.size()) { - - auto const absolute_row = relative_row + tile.start_row; + auto const absolute_row = relative_row + tile.start_row; auto const shared_memory_row_offset = tile_row_size * relative_row; for (int relative_col = warp.meta_group_rank(); relative_col < cols_in_tile; @@ -976,11 +1007,11 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const absolute_col = relative_col + tile.start_col; auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; + col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; auto const column_size = col_sizes[absolute_col]; - int8_t *shmem_src = &shared[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + int8_t* shmem_src = &shared[shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; MEMCPY(dst, shmem_src, column_size, tile_barrier); } @@ -992,7 +1023,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1011,12 +1042,16 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * */ template -__global__ void -copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, bitmask_type **output_nm, - const size_type validity_offset, device_span tile_infos, - const int8_t *input_data) { +__global__ void copy_validity_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + bitmask_type** output_nm, + const size_type validity_offset, + device_span tile_infos, + const int8_t* input_data) +{ extern __shared__ int8_t shared[]; using cudf::detail::warp_size; @@ -1036,46 +1071,44 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // __ballot_sync, representing 32 rows of that column. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. 
__shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; + auto const tile = tile_infos[blockIdx.x]; auto const tile_start_col = tile.start_col; auto const tile_start_row = tile.start_row; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const cols_per_read = CHAR_BIT; + auto const cols_per_read = CHAR_BIT; - auto const rows_per_read = static_cast(threads_per_warp); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; + auto const rows_per_read = static_cast(threads_per_warp); + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes + auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert section to row and col - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * cols_per_read; auto const relative_row = section_y * rows_per_read + warp.thread_rank(); auto const absolute_col = relative_col + tile_start_col; auto const absolute_row = relative_row + tile_start_row; auto const row_batch_start = - tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; - auto const participation_mask = __ballot_sync(0xFFFF'FFFFu, absolute_row < num_rows); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); if (absolute_row < num_rows) { auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) + validity_offset + @@ -1090,8 +1123,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // lead thread in each warp writes data if (warp.thread_rank() == 0) { auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; - *reinterpret_cast(&shared[validity_write_offset]) = validity_data; + validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; + *reinterpret_cast(&shared[validity_write_offset]) = validity_data; } } } @@ -1106,13 +1139,13 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { auto const absolute_col = relative_col + tile_start_col; - auto dst = output_nm[absolute_col] + word_index(tile_start_row); + auto dst = output_nm[absolute_col] + word_index(tile_start_row); auto const src = - reinterpret_cast(&shared[validity_data_col_length * relative_col]); + reinterpret_cast(&shared[validity_data_col_length * relative_col]); #ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, aligned_size_t<4>(validity_data_col_length), - shared_tile_barrier); + cuda::memcpy_async( + warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); #else for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { dst[b] = src[b]; @@ -1125,7 +1158,7 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1142,38 +1175,42 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, * @param num_string_columns number of string columns in the table */ template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **string_row_offsets, - int32_t **string_lengths, size_type **string_column_offsets, - char **string_col_data, int8_t const *row_data, +__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, + int32_t** string_row_offsets, + int32_t** string_lengths, + size_type** string_column_offsets, + char** string_col_data, + int8_t const* row_data, size_type const num_rows, - size_type const num_string_columns) { + size_type const num_string_columns) +{ // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. // Traversing in row-major order to coalesce the offsets and size reads. 
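// Illustrative host-side reader (not part of the patch) for the variable-width
// layout handled below: the fixed-width section of a row stores a
// (byte offset, length) pair of uint32 values per string column, and the
// string bytes live at that offset from the start of the same row.
// `read_string_from_row` and `pair_offset` are hypothetical names.
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

std::string read_string_from_row(std::vector<int8_t> const& row_bytes, int32_t pair_offset)
{
  uint32_t offset = 0;
  uint32_t length = 0;
  std::memcpy(&offset, row_bytes.data() + pair_offset, sizeof(uint32_t));
  std::memcpy(&length, row_bytes.data() + pair_offset + sizeof(uint32_t), sizeof(uint32_t));
  return std::string(reinterpret_cast<char const*>(row_bytes.data()) + offset, length);
}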
auto my_block = cooperative_groups::this_thread_block(); - auto warp = cooperative_groups::tiled_partition(my_block); + auto warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif // workaround for not being able to take a reference to a constexpr host variable auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; - auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); - auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); - auto const num_tiles = tiles_per_col * num_string_columns; - auto const tile_stride = warp.meta_group_size() * gridDim.x; + auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); + auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); + auto const num_tiles = tiles_per_col * num_string_columns; + auto const tile_stride = warp.meta_group_size() * gridDim.x; // Each warp will copy strings in its tile. This is handled by all the threads of a warp passing // the same parameters to async_memcpy and all threads in the warp participating in the copy. for (auto my_tile = starting_tile; my_tile < num_tiles; my_tile += tile_stride) { auto const starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK; - auto const col = my_tile / tiles_per_col; - auto const str_len = string_lengths[col]; - auto const str_row_off = string_row_offsets[col]; - auto const str_col_off = string_column_offsets[col]; - auto str_col_data = string_col_data[col]; + auto const col = my_tile / tiles_per_col; + auto const str_len = string_lengths[col]; + auto const str_row_off = string_row_offsets[col]; + auto const str_col_off = string_column_offsets[col]; + auto str_col_data = string_col_data[col]; for (int row = starting_row; row < starting_row + ROWS_PER_BLOCK && row < num_rows; ++row) { auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; - auto dst = &str_col_data[str_col_off[row]]; + auto dst = &str_col_data[str_col_off[row]]; #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); @@ -1196,8 +1233,12 @@ __global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **s * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_type num_rows, - const size_type size_per_row, dim3 &blocks, dim3 &threads) { +static int calc_fixed_width_kernel_dims(const size_type num_columns, + const size_type num_rows, + const size_type size_per_row, + dim3& blocks, + dim3& threads) +{ // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -1207,7 +1248,7 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. 
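// Simplified host-side sketch (not part of the patch) of the sizing rules
// described in the comment above: at most 4 columns per thread capped at 32
// threads in y, the rest of the 1024-thread budget in x, with x further
// limited so the staged rows fit the assumed 48 KiB of shared memory per
// block. `sketch_fixed_width_dims` is a hypothetical helper.
#include <algorithm>

struct kernel_dims {
  int block_x;
  int block_y;
  int shared_bytes;
};

kernel_dims sketch_fixed_width_dims(int num_columns, int num_rows, int size_per_row)
{
  int const y_block_size       = std::min((num_columns + 3) / 4, 32);
  int const x_possible         = 1024 / y_block_size;
  int const rows_fitting_shmem = std::max((48 * 1024) / size_per_row, 1);
  int const block_size         = std::min({x_possible, rows_fitting_shmem, num_rows});
  return {block_size, y_block_size, block_size * size_per_row};
}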
- int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); + int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); int const x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1230,9 +1271,9 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // to try and future proof this a bit. int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240); - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; threads.x = block_size; threads.y = y_block_size; threads.z = 1; @@ -1246,12 +1287,19 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ * into this function are common between runs and should be calculated once. */ static std::unique_ptr fixed_width_convert_to_rows( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type size_per_row, rmm::device_uvector &column_start, - rmm::device_uvector &column_size, rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, const scalar &zero, - const scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const scalar& zero, + const scalar& scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), @@ -1259,28 +1307,43 @@ static std::unique_ptr fixed_width_convert_to_rows( // Allocate and set the offsets row for the byte array std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); - std::unique_ptr data = - make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), - mask_state::UNALLOCATED, stream, mr); + std::unique_ptr data = make_numeric_column(data_type(type_id::INT8), + static_cast(total_allocation), + mask_state::UNALLOCATED, + stream, + mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, stream, mr}, stream, mr); + start_row, + num_rows, + num_columns, + size_per_row, + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), + data->mutable_view().data()); + + return make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); } -static inline bool are_all_fixed_width(std::vector const &schema) { - return 
std::all_of(schema.begin(), schema.end(), - [](const data_type &t) { return is_fixed_width(t); }); +static inline bool are_all_fixed_width(std::vector const& schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const data_type& t) { return is_fixed_width(t); }); } /** @@ -1291,9 +1354,10 @@ static inline bool are_all_fixed_width(std::vector const &schema) { * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) +{ // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. int32_t at_offset = 0; @@ -1301,7 +1365,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s size_type s = size_of(*col); column_size.emplace_back(s); std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); column_start.emplace_back(at_offset); at_offset += allocation_needed; @@ -1311,7 +1375,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s // Eventually we can think about nullable vs not nullable, but for now we will just always add // it in int32_t const validity_bytes_needed = - util::div_rounding_up_safe(schema.size(), CHAR_BIT); + util::div_rounding_up_safe(schema.size(), CHAR_BIT); // validity comes at the end and is byte aligned so we can pack more in. at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1327,8 +1391,8 @@ struct column_info_s { std::vector column_sizes; std::vector variable_width_column_starts; - column_info_s &operator=(column_info_s const &other) = delete; - column_info_s &operator=(column_info_s &&other) = delete; + column_info_s& operator=(column_info_s const& other) = delete; + column_info_s& operator=(column_info_s&& other) = delete; }; /** @@ -1342,7 +1406,8 @@ struct column_info_s { * @return size of the fixed_width data portion of a row. */ template -column_info_s compute_column_information(iterator begin, iterator end) { +column_info_s compute_column_information(iterator begin, iterator end) +{ size_type size_per_row = 0; std::vector column_starts; std::vector column_sizes; @@ -1360,10 +1425,8 @@ column_info_s compute_column_information(iterator begin, iterator end) { // align size for this type - They are the same for fixed width types and 4 bytes for variable // width length/offset combos size_type const alignment_needed = compound_type ? 
__alignof(uint32_t) : col_size; - size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); - if (compound_type) { - variable_width_column_starts.push_back(size_per_row); - } + size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); + if (compound_type) { variable_width_column_starts.push_back(size_per_row); } column_starts.push_back(size_per_row); column_sizes.push_back(col_size); size_per_row += col_size; @@ -1375,9 +1438,11 @@ column_info_s compute_column_information(iterator begin, iterator end) { // validity is byte-aligned in the JCUDF format size_per_row += - util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); - return {size_per_row, std::move(column_starts), std::move(column_sizes), + return {size_per_row, + std::move(column_starts), + std::move(column_sizes), std::move(variable_width_column_starts)}; } @@ -1390,34 +1455,35 @@ column_info_s compute_column_information(iterator begin, iterator end) { * @param row_batches batched row information for multiple output locations * @return vector of `tile_info` structs for validity data */ -std::vector -build_validity_tile_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_tile, - std::vector const &row_batches) { +std::vector build_validity_tile_infos(size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_tile, + std::vector const& row_batches) +{ auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_tile)); - auto const column_stride = util::round_up_unsafe( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, build a single tile for table width and ship it off - return num_columns; - } else { - return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); - } - }(), - JCUDF_ROW_ALIGNMENT); + auto const column_stride = util::round_up_unsafe( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, build a single tile for table width and ship it off + return num_columns; + } else { + return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); + } + }(), + JCUDF_ROW_ALIGNMENT); // we fit as much as we can given the column stride note that an element in the table takes just 1 // bit, but a row with a single element still takes 8 bytes! 
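The trade-off noted in the comment above (one validity bit per table element, but a whole 8-byte-aligned slab per row) is easy to see with a small standalone sketch of the per-row footprint and the resulting tile height. The names below are illustrative only, and the real code additionally walks the row batches when laying out validity tiles.

    #include <algorithm>
    #include <climits>

    // Per-row shared-memory footprint of a validity tile: one bit per column in
    // the tile, rounded up to whole bytes and then padded to the 8-byte JCUDF
    // row alignment, so even a single-column tile costs 8 bytes per row.
    inline int validity_bytes_per_row(int column_stride)
    {
      int const bytes = (column_stride + CHAR_BIT - 1) / CHAR_BIT;
      return (bytes + 7) / 8 * 8;  // JCUDF_ROW_ALIGNMENT
    }

    // Tile height that fits the shared-memory budget, rounded down to a multiple
    // of 64 rows (mirroring the row stride used in the patch) and capped at the
    // table's row count.
    inline int validity_rows_per_tile(int shmem_limit_per_tile, int column_stride, int num_rows)
    {
      int const fit = shmem_limit_per_tile / validity_bytes_per_row(column_stride) / 64 * 64;
      return std::min(num_rows, fit);
    }

Even a one-column tile reports 8 bytes per row here, which is exactly the "1 bit vs 8 bytes" point the comment makes.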
- auto const bytes_per_row = util::round_up_safe( - util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + auto const bytes_per_row = + util::round_up_safe(util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const row_stride = - std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); + std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); std::vector validity_tile_infos; validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride); for (int col = 0; col < num_columns; col += column_stride) { int current_tile_row_batch = 0; - int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; - int row = 0; + int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; + int row = 0; while (row < num_rows) { if (rows_left_in_batch == 0) { current_tile_row_batch++; @@ -1425,8 +1491,11 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row } int const tile_height = std::min(row_stride, rows_left_in_batch); validity_tile_infos.emplace_back( - detail::tile_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - row + tile_height - 1, current_tile_row_batch}); + detail::tile_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + row + tile_height - 1, + current_tile_row_batch}); row += tile_height; rows_left_in_batch -= tile_height; } @@ -1441,11 +1510,15 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row * * @tparam RowSize iterator that returns the size of a specific row */ -template struct row_size_functor { +template +struct row_size_functor { row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end) - : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) {} + : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) + { + } - __device__ inline uint64_t operator()(int i) const { + __device__ inline uint64_t operator()(int i) const + { return i >= _row_end ? 0 : _row_sizes[i + _last_row_end]; } @@ -1467,11 +1540,15 @@ template struct row_size_functor { * device_uvector of row offsets */ template -batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +batch_data build_batches(size_type num_rows, + RowSize row_sizes, + bool all_fixed_width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); + util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; @@ -1482,8 +1559,9 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w batch_row_boundaries.push_back(0); size_type last_row_end = 0; device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, - cumulative_row_sizes.begin()); + + thrust::inclusive_scan( + rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); // This needs to be split this into 2 gig batches. Care must be taken to avoid a batch larger than // 2 gigs. Imagine a table with 900 meg rows. 
The batches should occur every 2 rows, but if a @@ -1497,22 +1575,22 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w while (last_row_end < num_rows) { auto offset_row_sizes = thrust::make_transform_iterator( - cumulative_row_sizes.begin(), - cuda::proclaim_return_type( - [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { - return i - cumulative_row_sizes[last_row_end]; - })); + cumulative_row_sizes.begin(), + cuda::proclaim_return_type( + [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { + return i - cumulative_row_sizes[last_row_end]; + })); auto search_start = offset_row_sizes + last_row_end; - auto search_end = offset_row_sizes + num_rows; + auto search_end = offset_row_sizes + num_rows; // find the next MAX_BATCH_SIZE boundary auto const lb = - thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); + thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); size_type const batch_size = lb - search_start; - size_type const row_end = lb == search_end ? - batch_size + last_row_end : - last_row_end + util::round_down_safe(batch_size, 32); + size_type const row_end = lb == search_end + ? batch_size + last_row_end + : last_row_end + util::round_down_safe(batch_size, 32); // build offset list for each row in this batch auto const num_rows_in_batch = row_end - last_row_end; @@ -1522,10 +1600,12 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w device_uvector output_batch_row_offsets(num_entries, stream, mr); auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( - 0, row_size_functor(row_end, row_sizes, last_row_end)); + 0, row_size_functor(row_end, row_sizes, last_row_end)); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, - row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + thrust::exclusive_scan(rmm::exec_policy(stream), + row_size_iter_bounded, + row_size_iter_bounded + num_entries, + output_batch_row_offsets.begin()); auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); @@ -1533,8 +1613,10 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w // needs to be individually allocated, but the kernel needs a contiguous array of offsets or // more global lookups are necessary. 
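The batching rule described in the comments above (cut a batch just before the running size would cross the roughly 2 GB limit, keep interior cuts at a multiple of 32 rows, and measure each new batch from its own starting offset rather than from row 0) can be sketched on the host with standard algorithms. This is an illustration of the arithmetic only, with invented names; the patch performs the scan and lower_bound on the device with thrust.

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <numeric>
    #include <vector>

    // Host-side illustration of the batch-boundary search: build a running total
    // of row sizes, then repeatedly find the first row whose inclusion would push
    // the current batch past the 2 GB limit. Interior cuts are rounded down to a
    // multiple of 32 rows; the previous batch's size is subtracted so each batch
    // is measured from its own start.
    inline std::vector<int> sketch_batch_boundaries(std::vector<uint64_t> const& row_sizes)
    {
      auto const limit = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
      std::vector<uint64_t> running(row_sizes.size());
      std::partial_sum(row_sizes.begin(), row_sizes.end(), running.begin());

      std::vector<int> boundaries{0};
      int row            = 0;
      int const num_rows = static_cast<int>(row_sizes.size());
      while (row < num_rows) {
        uint64_t const already_used = (row == 0) ? 0 : running[row - 1];
        auto const cut =
          std::lower_bound(running.begin() + row, running.end(), already_used + limit);
        int span = static_cast<int>(cut - (running.begin() + row));
        if (cut != running.end() && span >= 32) span = span / 32 * 32;
        if (span == 0) span = 1;  // degenerate guard: a single row at or near the limit
        row += span;
        boundaries.push_back(row);
      }
      return boundaries;
    }

For the "900 meg rows" example in the comment, each batch ends up holding two rows, and every search starts from the previous boundary's running total rather than from zero, which is precisely the pitfall the comment warns about.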
if (!all_fixed_width) { - cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), cudaMemcpyDefault); + cudaMemcpy(batch_row_offsets.data() + last_row_end, + output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), + cudaMemcpyDeviceToDevice); } batch_row_boundaries.push_back(row_end); @@ -1543,10 +1625,11 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w last_row_end = row_end; } - return {std::move(batch_row_offsets), - make_device_uvector_async(batch_row_boundaries, stream, - rmm::mr::get_current_device_resource()), - std::move(batch_row_boundaries), std::move(row_batches)}; + return { + std::move(batch_row_offsets), + make_device_uvector_async(batch_row_boundaries, stream, rmm::mr::get_current_device_resource()), + std::move(batch_row_boundaries), + std::move(row_batches)}; } /** @@ -1557,20 +1640,25 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w * @param stream stream to use * @return number of tiles necessary */ -int compute_tile_counts(device_span const &batch_row_boundaries, - int desired_tile_height, rmm::cuda_stream_view stream) { +int compute_tile_counts(device_span const& batch_row_boundaries, + int desired_tile_height, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - cuda::proclaim_return_type( - [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( - auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - })); + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_tiles.begin(), + cuda::proclaim_return_type( + [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( + auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + })); return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); } @@ -1586,64 +1674,78 @@ int compute_tile_counts(device_span const &batch_row_boundaries * @param stream stream to use * @return number of tiles created */ -size_type -build_tiles(device_span tiles, - device_uvector const &batch_row_boundaries, // comes from build_batches - int column_start, int column_end, int desired_tile_height, int total_number_of_rows, - rmm::cuda_stream_view stream) { +size_type build_tiles( + device_span tiles, + device_uvector const& batch_row_boundaries, // comes from build_batches + int column_start, + int column_end, + int desired_tile_height, + int total_number_of_rows, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - cuda::proclaim_return_type( - [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( - auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - })); + 
rmm::exec_policy(stream), + iter, + iter + num_batches, + num_tiles.begin(), + cuda::proclaim_return_type( + [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( + auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + })); size_type const total_tiles = - thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); + thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); device_uvector tile_starts(num_batches + 1, stream); auto tile_iter = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type( - [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_tiles[i] : 0; - })); - thrust::exclusive_scan(rmm::exec_policy(stream), tile_iter, tile_iter + num_batches + 1, - tile_starts.begin()); // in tiles + 0, + cuda::proclaim_return_type( + [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? num_tiles[i] : 0; + })); + thrust::exclusive_scan(rmm::exec_policy(stream), + tile_iter, + tile_iter + num_batches + 1, + tile_starts.begin()); // in tiles thrust::transform( - rmm::exec_policy(stream), iter, iter + total_tiles, tiles.begin(), - cuda::proclaim_return_type( - [=, tile_starts = tile_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { - // what batch this tile falls in - auto const batch_index_iter = thrust::upper_bound( - thrust::seq, tile_starts, tile_starts + num_batches, tile_index); - auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; - // local index within the tile - int const local_tile_index = tile_index - tile_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this tile - int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); - // the end row for this tile - int const max_row = - std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const tile_row_end = std::min( - batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - - // stuff the tile - return tile_info{column_start, tile_row_start, column_end, tile_row_end, - static_cast(batch_index)}; - })); + rmm::exec_policy(stream), + iter, + iter + total_tiles, + tiles.begin(), + cuda::proclaim_return_type( + [ =, + tile_starts = tile_starts.data(), + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { + // what batch this tile falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); + auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; + // local index within the tile + int const local_tile_index = tile_index - tile_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_boundaries[batch_index]; + // the start row for this tile + int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); + // the end row for this tile + int const max_row = + std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches + ? 
std::numeric_limits::max() + : static_cast(batch_row_boundaries[batch_index + 1]) - 1); + int const tile_row_end = + std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); + + // stuff the tile + return tile_info{ + column_start, tile_row_start, column_end, tile_row_end, static_cast(batch_index)}; + })); return total_tiles; } @@ -1661,13 +1763,16 @@ build_tiles(device_span tiles, * @param f callback function called when building a tile */ template -void determine_tiles(std::vector const &column_sizes, - std::vector const &column_starts, - size_type const first_row_batch_size, size_type const total_number_of_rows, - size_type const &shmem_limit_per_tile, TileCallback f) { +void determine_tiles(std::vector const& column_sizes, + std::vector const& column_starts, + size_type const first_row_batch_size, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_tile, + TileCallback f) +{ // tile infos are organized with the tile going "down" the columns this provides the most // coalescing of memory access - int current_tile_width = 0; + int current_tile_width = 0; int current_tile_start_col = 0; // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write would @@ -1676,10 +1781,10 @@ void determine_tiles(std::vector const &column_sizes, // sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we want them // equal, so height and width are sqrt(shared_mem_size). The trick is that it's in bytes, not rows // or columns. - auto const square_bias = 32; // bias towards columns for performance reasons - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); + auto const square_bias = 32; // bias towards columns for performance reasons + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); auto const desired_tile_height = util::round_up_safe( - std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); + std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); auto const tile_height = std::clamp(desired_tile_height, 1, first_row_batch_size); int row_size = 0; @@ -1689,22 +1794,22 @@ void determine_tiles(std::vector const &column_sizes, auto const col_size = column_sizes[col]; // align size for this type - auto const alignment_needed = col_size; // They are the same for fixed width types - auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); + auto const alignment_needed = col_size; // They are the same for fixed width types + auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); auto const row_size_with_this_col = row_size_aligned + col_size; auto const row_size_with_end_pad = - util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); + util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { // too large, close this tile, generate vertical tiles and restart f(current_tile_start_col, col == 0 ? 
col : col - 1, tile_height); row_size = - util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory tile boundary to match alignment - // of output row + util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory tile boundary to match + // alignment of output row current_tile_start_col = col; - current_tile_width = 0; + current_tile_width = 0; } else { row_size = row_size_with_this_col; current_tile_width++; @@ -1732,155 +1837,196 @@ void determine_tiles(std::vector const &column_sizes, */ template std::vector> convert_to_rows( - table_view const &tbl, batch_data &batch_info, offsetFunctor offset_functor, - column_info_s const &column_info, - std::optional> variable_width_offsets, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + table_view const& tbl, + batch_data& batch_info, + offsetFunctor offset_functor, + column_info_s const& column_info, + std::optional> variable_width_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = !variable_width_offsets.has_value(); - auto select_columns = [](auto const &tbl, auto column_predicate) { + auto select_columns = [](auto const& tbl, auto column_predicate) { std::vector cols; - std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), - [&](auto c) { return column_predicate(c); }); + std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), [&](auto c) { + return column_predicate(c); + }); return table_view(cols); }; - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); // Get the pointers to the input columnar data ready - auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { + auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return is_compound(c.type()) ? 
nullptr : c.template data(); }); - std::vector input_data(data_begin, data_begin + tbl.num_columns()); + std::vector input_data(data_begin, data_begin + tbl.num_columns()); // validity code handles variable and fixed-width data, so give it everything auto const nm_begin = - thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); - std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); + thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return c.null_mask(); }); + std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); auto dev_input_data = - make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); auto dev_input_nm = - make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(batch_info.row_batches.size()); output_buffers.reserve(batch_info.row_batches.size()); - std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), - std::back_inserter(output_buffers), [&](auto const &batch) { - return rmm::device_buffer(batch.num_bytes, stream, mr); - }); - std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), - [](auto &buf) { return static_cast(buf.data()); }); + std::transform( + batch_info.row_batches.begin(), + batch_info.row_batches.end(), + std::back_inserter(output_buffers), + [&](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); + std::transform( + output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto& buf) { + return static_cast(buf.data()); + }); auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, - &stream](int const start_col, int const end_col, int const tile_height) { - int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - info_count += i; - }); + column_info.column_sizes, + column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); + info_count += i; + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &gpu_tile_infos, num_rows, - &tile_offset, stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + 
column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, + &gpu_tile_infos, + num_rows, + &tile_offset, + stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); // build validity tiles for ALL columns, variable and fixed width. auto validity_tile_infos = detail::build_validity_tile_infos( - tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); + tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); auto const validity_offset = column_info.column_starts.back(); // blast through the entire table and convert it - detail::copy_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), offset_functor, - batch_info.d_batch_row_boundaries.data(), - reinterpret_cast(dev_output_data.data())); + detail::copy_to_rows<<>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + gpu_tile_infos, + dev_input_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + offset_functor, + batch_info.d_batch_row_boundaries.data(), + reinterpret_cast(dev_output_data.data())); // note that validity gets the entire table and not the fixed-width portion detail::copy_validity_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, offset_functor, - batch_info.d_batch_row_boundaries.data(), dev_output_data.data(), validity_offset, - dev_validity_tile_infos, dev_input_nm.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + offset_functor, + batch_info.d_batch_row_boundaries.data(), + dev_output_data.data(), + validity_offset, + dev_validity_tile_infos, + dev_input_nm.data()); if (!fixed_width_only) { // build table view for variable-width data only auto const variable_width_table = - select_columns(tbl, [](auto col) { return is_compound(col.type()); }); + select_columns(tbl, [](auto col) { return is_compound(col.type()); }); CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const &c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); - std::vector variable_width_input_data( - variable_data_begin, variable_data_begin + variable_width_table.num_columns()); + thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { + strings_column_view const scv{c}; + return is_compound(c.type()) ? 
scv.chars().template data() : nullptr; + }); + std::vector variable_width_input_data( + variable_data_begin, variable_data_begin + variable_width_table.num_columns()); auto dev_variable_input_data = make_device_uvector_async( - variable_width_input_data, stream, rmm::mr::get_current_device_resource()); + variable_width_input_data, stream, rmm::mr::get_current_device_resource()); auto dev_variable_col_output_offsets = make_device_uvector_async( - column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); + column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); for (uint i = 0; i < batch_info.row_batches.size(); i++) { auto const batch_row_offset = batch_info.batch_row_boundaries[i]; - auto const batch_num_rows = batch_info.row_batches[i].row_count; - - dim3 const string_blocks(std::min( - MAX_STRING_BLOCKS, - util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); - - detail::copy_strings_to_rows<<>>( - batch_num_rows, variable_width_table.num_columns(), dev_variable_input_data.data(), - dev_variable_col_output_offsets.data(), variable_width_offsets->data(), - column_info.size_per_row, offset_functor, batch_row_offset, - reinterpret_cast(output_data[i])); + auto const batch_num_rows = batch_info.row_batches[i].row_count; + + dim3 const string_blocks( + std::min(MAX_STRING_BLOCKS, + util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); + + detail::copy_strings_to_rows<<>>(batch_num_rows, + variable_width_table.num_columns(), + dev_variable_input_data.data(), + dev_variable_col_output_offsets.data(), + variable_width_offsets->data(), + column_info.size_per_row, + offset_functor, + batch_row_offset, + reinterpret_cast(output_data[i])); } } @@ -1889,26 +2035,36 @@ std::vector> convert_to_rows( std::vector> ret; ret.reserve(batch_info.row_batches.size()); auto counting_iter = thrust::make_counting_iterator(0); - std::transform(counting_iter, counting_iter + batch_info.row_batches.size(), - std::back_inserter(ret), [&](auto batch) { + std::transform(counting_iter, + counting_iter + batch_info.row_batches.size(), + std::back_inserter(ret), + [&](auto batch) { auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); auto offsets = - std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release(), - rmm::device_buffer{}, 0); - auto data = std::make_unique( - data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, - std::move(output_buffers[batch]), rmm::device_buffer{}, 0); - - return make_lists_column( - batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), - 0, rmm::device_buffer{0, cudf::get_default_stream(), mr}, stream, mr); + std::make_unique(data_type{type_id::INT32}, + (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release(), + rmm::device_buffer{}, + 0); + auto data = std::make_unique(data_type{type_id::INT8}, + batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch]), + rmm::device_buffer{}, + 0); + + return make_lists_column(batch_info.row_batches[batch].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); }); return ret; } -} // namespace detail +} // namespace detail /** * @brief convert a cudf table to JCUDF row format @@ -1918,14 +2074,15 @@ std::vector> convert_to_rows( * @param mr memory resource used for returned data * @return 
vector of list columns containing byte columns of the JCUDF row data */ -std::vector> convert_to_rows(table_view const &tbl, +std::vector> convert_to_rows(table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = std::all_of( - tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + tbl.begin(), tbl.end(), [](column_view const& c) { return is_fixed_width(c.type()); }); // Break up the work into tiles, which are a starting and ending row/col #. This tile size is // calculated based on the shared memory size available we want a single tile to fill up the @@ -1941,94 +2098,107 @@ std::vector> convert_to_rows(table_view const &tbl, // before building the tiles so the tiles can be properly cut around them. auto schema_column_iter = - thrust::make_transform_iterator(tbl.begin(), [](auto const &i) { return i.type(); }); + thrust::make_transform_iterator(tbl.begin(), [](auto const& i) { return i.type(); }); auto column_info = - detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); + detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); auto const size_per_row = column_info.size_per_row; if (fixed_width_only) { // total encoded row size. This includes fixed-width data and validity only. It does not include // variable-width data since it isn't copied with the fixed-width and validity kernel. auto row_size_iter = thrust::make_constant_iterator( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::fixed_width_row_offset_functor offset_functor( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::nullopt, stream, mr); + return detail::convert_to_rows( + tbl, batch_info, offset_functor, std::move(column_info), std::nullopt, stream, mr); } else { auto offset_data = detail::build_string_row_offsets(tbl, size_per_row, stream); - auto &row_sizes = std::get<0>(offset_data); + auto& row_sizes = std::get<0>(offset_data); auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); + 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::make_optional(std::move(std::get<1>(offset_data))), stream, + return detail::convert_to_rows(tbl, + batch_info, + offset_functor, + std::move(column_info), + std::make_optional(std::move(std::get<1>(offset_data))), + stream, mr); } } -std::vector> -convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::vector> convert_to_rows_fixed_width_optimized( + table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); 
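As a concrete reading of the layout rules the conversion paths above build on (each fixed-width column aligned to its own width, variable-width columns contributing a 4-byte offset plus 4-byte length, one validity bit per column byte-aligned at the end, and the whole row padded to the 8-byte JCUDF alignment), the following standalone sketch computes the encoded row size for a simple schema. The struct and function names are invented for the example.

    #include <climits>
    #include <cstdint>
    #include <vector>

    // Illustrative computation of the JCUDF encoded row size: fixed-width columns
    // are aligned to their own width, variable-width (string) columns store a
    // 4-byte offset and a 4-byte length, validity is one bit per column packed
    // into byte-aligned storage at the end, and the row is padded to 8 bytes.
    struct SketchCol {
      int width;            // size in bytes for fixed-width types
      bool variable_width;  // true for strings and other offset/length columns
    };

    inline int sketch_jcudf_row_size(std::vector<SketchCol> const& schema)
    {
      auto round_up = [](int value, int align) { return (value + align - 1) / align * align; };
      int offset    = 0;
      for (auto const& col : schema) {
        int const size  = col.variable_width ? 2 * static_cast<int>(sizeof(int32_t)) : col.width;
        int const align = col.variable_width ? static_cast<int>(sizeof(int32_t)) : col.width;
        offset          = round_up(offset, align) + size;
      }
      offset += (static_cast<int>(schema.size()) + CHAR_BIT - 1) / CHAR_BIT;  // validity bits
      return round_up(offset, 8);  // JCUDF_ROW_ALIGNMENT
    }

For a schema of INT8, INT32, INT64 the data section ends at byte 16, a single validity byte brings it to 17, and the padded row is 24 bytes; adding one string column appends an 8-byte offset/length pair and the row grows to 32 bytes.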
std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), - [](auto i) -> data_type { return i.type(); }); + std::transform( + tbl.begin(), tbl.end(), schema.begin(), [](auto i) -> data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); + detail::compute_fixed_width_layout(schema, column_start, column_size); auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); // Make the number of rows per batch a multiple of 32 so we don't have to worry about splitting // validity at a specific row offset. This might change in the future. auto const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); auto const num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = scalar_type_t; - auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); + auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); step->set_valid_async(true, stream); - static_cast(step.get())->set_value(static_cast(size_per_row), stream); + static_cast(step.get())->set_value(static_cast(size_per_row), stream); std::vector> ret; for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); } return ret; @@ -2037,6 +2207,19 @@ convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_vi } } +namespace { + +/// @brief Calculates and sets null counts for specified columns +void fixup_null_counts(std::vector>& output_columns, + rmm::cuda_stream_view stream) +{ + for (auto& col : output_columns) { + col->set_null_count(cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); + } +} + +} // namespace + /** * @brief convert from JCUDF row format to cudf columns * @@ -2046,12 +2229,13 @@ convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_vi * @param mr memory resource for returned data * @return cudf table of the data */ -std::unique_ptr convert_from_rows(lists_column_view const &input, - std::vector const &schema, +std::unique_ptr
convert_from_rows(lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2071,19 +2255,19 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto const num_columns = string_schema.size(); - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; @@ -2093,41 +2277,44 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, // Ideally we would check that the offsets are all the same, etc. but for now this is probably // fine CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; std::vector> string_row_offset_columns; std::vector> string_length_columns; - std::vector output_data; - std::vector output_nm; - std::vector string_row_offsets; - std::vector string_lengths; + std::vector output_data; + std::vector output_nm; + std::vector string_row_offsets; + std::vector string_lengths; for (auto i : schema) { - auto make_col = [&output_data, &output_nm](data_type type, size_type num_rows, bool include_nm, + auto make_col = [&output_data, &output_nm](data_type type, + size_type num_rows, + bool include_nm, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - auto column = make_fixed_width_column( - type, num_rows, include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, - mr); + rmm::mr::device_memory_resource* mr) { + auto column = + make_fixed_width_column(type, + num_rows, + include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, + stream, + mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); - if (include_nm) { - output_nm.emplace_back(mut.null_mask()); - } + if (include_nm) { output_nm.emplace_back(mut.null_mask()); } return column; }; if (i.id() == type_id::STRING) { auto const int32type = data_type(type_id::INT32); auto offset_col = - make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); string_row_offsets.push_back(offset_col->mutable_view().data()); string_row_offset_columns.emplace_back(std::move(offset_col)); auto length_col = - make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); string_lengths.push_back(length_col->mutable_view().data()); string_length_columns.emplace_back(std::move(length_col)); // placeholder @@ -2138,160 +2325,214 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto dev_string_row_offsets = - make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); auto dev_string_lengths = - make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); // build the row_batches from the passed in list column std::vector row_batches; row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); auto dev_output_data = - make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); auto dev_output_nm = - make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); // only ever get a single batch when going from rows, so boundaries are 0, num_rows constexpr auto num_batches = 2; device_uvector gpu_batch_row_boundaries(num_batches, stream); - thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_batches), gpu_batch_row_boundaries.begin(), + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_batches), + gpu_batch_row_boundaries.begin(), cuda::proclaim_return_type( - [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; })); + [num_rows] __device__(auto i) { return i == 0 ? 
0 : num_rows; })); int info_count = 0; - detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, - int const tile_height) { - info_count += detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - }); + detail::determine_tiles(column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + info_count += detail::compute_tile_counts( + gpu_batch_row_boundaries, tile_height, stream); + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, - stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, stream]( + int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); dim3 const blocks(gpu_tile_infos.size()); // validity needs to be calculated based on the actual number of final table columns auto validity_tile_infos = - detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); + detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); dim3 const validity_blocks(validity_tile_infos.size()); if (dev_string_row_offsets.size() == 0) { detail::fixed_width_row_offset_functor offset_functor(size_per_row); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); } else { detail::string_row_offset_functor 
offset_functor(device_span{input.offsets()}); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); std::vector> string_col_offsets; std::vector> string_data_cols; - std::vector string_col_offset_ptrs; - std::vector string_data_col_ptrs; - for (auto &col_string_lengths : string_lengths) { + std::vector string_col_offset_ptrs; + std::vector string_data_col_ptrs; + for (auto& col_string_lengths : string_lengths) { device_uvector output_string_offsets(num_rows + 1, stream, mr); - auto tmp = cuda::proclaim_return_type( - [num_rows, col_string_lengths] __device__(auto const &i) { - return i < num_rows ? col_string_lengths[i] : 0; - }); + auto tmp = cuda::proclaim_return_type( + [num_rows, col_string_lengths] __device__(auto const& i) { + return i < num_rows ? col_string_lengths[i] : 0; + }); auto bounded_iter = cudf::detail::make_counting_transform_iterator(0, tmp); - thrust::exclusive_scan(rmm::exec_policy(stream), bounded_iter, bounded_iter + num_rows + 1, + thrust::exclusive_scan(rmm::exec_policy(stream), + bounded_iter, + bounded_iter + num_rows + 1, output_string_offsets.begin()); // allocate destination string column - rmm::device_uvector string_data(output_string_offsets.element(num_rows, stream), stream, - mr); + rmm::device_uvector string_data( + output_string_offsets.element(num_rows, stream), stream, mr); string_col_offset_ptrs.push_back(output_string_offsets.data()); string_data_col_ptrs.push_back(string_data.data()); string_col_offsets.push_back(std::move(output_string_offsets)); string_data_cols.push_back(std::move(string_data)); } - auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream, - rmm::mr::get_current_device_resource()); - auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream, - rmm::mr::get_current_device_resource()); + auto dev_string_col_offsets = make_device_uvector_async( + string_col_offset_ptrs, stream, rmm::mr::get_current_device_resource()); + auto dev_string_data_cols = make_device_uvector_async( + string_data_col_ptrs, stream, rmm::mr::get_current_device_resource()); dim3 const string_blocks( - std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), - MAX_STRING_BLOCKS)); + std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), + MAX_STRING_BLOCKS)); - detail::copy_strings_from_rows<<>>( - offset_functor, dev_string_row_offsets.data(), dev_string_lengths.data(), - dev_string_col_offsets.data(), dev_string_data_cols.data(), child.data(), num_rows, - static_cast(string_col_offsets.size())); + offset_functor, + dev_string_row_offsets.data(), + 
dev_string_lengths.data(), + dev_string_col_offsets.data(), + dev_string_data_cols.data(), + child.data(), + num_rows, + static_cast(string_col_offsets.size())); // merge strings back into output_columns int string_idx = 0; for (int i = 0; i < static_cast(schema.size()); ++i) { if (schema[i].id() == type_id::STRING) { // stuff real string column - auto const null_count = string_row_offset_columns[string_idx]->null_count(); - auto string_data = string_row_offset_columns[string_idx].release()->release(); - output_columns[i] = make_strings_column( - num_rows, - std::make_unique(std::move(string_col_offsets[string_idx]), - rmm::device_buffer{}, 0), - std::make_unique(std::move(string_data_cols[string_idx]), - rmm::device_buffer{}, 0), - null_count, std::move(*string_data.null_mask.release())); + auto string_data = string_row_offset_columns[string_idx].release()->release(); + output_columns[i] = make_strings_column(num_rows, + std::move(string_col_offsets[string_idx]), + std::move(string_data_cols[string_idx]), + std::move(*string_data.null_mask.release()), + 0); + // Null count set to 0, temporarily. Will be fixed up before return. string_idx++; } } } - for (auto &col : output_columns) { - col->set_null_count(cudf::null_count(col->view().null_mask(), 0, col->size())); - } + // Set null counts, because output_columns are modified via mutable-view, + // in the kernel above. + // TODO(future): Consider setting null count in the kernel itself. + fixup_null_counts(output_columns, stream); + return std::make_unique
(std::move(output_columns)); } -std::unique_ptr
convert_from_rows_fixed_width_optimized( - lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +std::unique_ptr
convert_from_rows_fixed_width_optimized(lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2302,7 +2543,7 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( std::vector column_start; std::vector column_size; - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now this is probably @@ -2310,17 +2551,17 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = - make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); auto dev_column_size = - make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (int i = 0; i < static_cast(num_columns); i++) { auto column = - make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -2328,26 +2569,32 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); + num_rows, + num_columns, + size_per_row, + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), + child.data()); + + // Set null counts, because output_columns are modified via mutable-view, + // in the kernel above. + // TODO(future): Consider setting null count in the kernel itself. + fixup_null_counts(output_columns, stream); - for (auto &col : output_columns) { - col->set_null_count(cudf::null_count(col->view().null_mask(), 0, col->size())); - } return std::make_unique
(std::move(output_columns)); } else { CUDF_FAIL("Only fixed width types are currently supported"); } } -} // namespace jni - -} // namespace cudf +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index af4ab8c2485..40d745338f4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -257,6 +257,8 @@ ConfigureTest( transform/one_hot_encode_tests.cpp ) +ConfigureTest(ROW_CONVERSION_TEST transform/row_conversion.cpp) + # ################################################################################################## # * interop tests ------------------------------------------------------------------------- ConfigureTest( diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp new file mode 100644 index 00000000000..6d8f714401c --- /dev/null +++ b/cpp/tests/transform/row_conversion.cpp @@ -0,0 +1,1021 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +struct ColumnToRowTests : public cudf::test::BaseFixture {}; +struct RowToColumnTests : public cudf::test::BaseFixture {}; + +TEST_F(ColumnToRowTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, SimpleString) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1, 0, -1}); + cudf::test::strings_column_wrapper b( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::table_view in(std::vector{a, b}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), 5); +} + +TEST_F(ColumnToRowTests, DoubleString) +{ + cudf::test::strings_column_wrapper a( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); + cudf::test::strings_column_wrapper c({"world", + "hello", + "this string isn't as long", + "this one isn't so short though when you think about it", + "dlrow"}); + cudf::table_view in(std::vector{a, b, c}); + + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), 5); +} + +TEST_F(ColumnToRowTests, BigStrings) +{ + char const* TEST_STRINGS[] 
= { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "a", + "good test", + "is required to produce reasonable confidence that this is working"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 50; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), num_rows); +} + +TEST_F(ColumnToRowTests, ManyStrings) +{ + char const* TEST_STRINGS[] = { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "a", + "good test", + "is required to produce reasonable confidence that this is working", + "some strings", + "are split into multiple strings", + "some strings have all their data", + "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " + "that even a really long string ends up in the correct spot for the final destination allowing " + "for even crazy run-on sentences to be inserted into the data"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 1000000; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), num_rows); +} + +TEST_F(ColumnToRowTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); 
+ + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Wide) +{ + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 2 million rows + constexpr auto num_rows = 2 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Wide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + 
schema.push_back(col->type()); + } + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypes) +{ + std::vector> cols; + std::vector views; + std::vector schema{cudf::data_type{cudf::type_id::INT64}, + cudf::data_type{cudf::type_id::FLOAT64}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::BOOL8}, + cudf::data_type{cudf::type_id::FLOAT32}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::INT64}}; + + cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_point_column_wrapper c6( + {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper c7( + {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); + + cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypesLarge) +{ + std::vector cols; + std::vector schema{}; + + // 15 columns of each type with 1 million entries + constexpr int num_rows{1024 * 1024 * 1}; + + std::default_random_engine re; + std::uniform_real_distribution rand_double(std::numeric_limits::min(), + std::numeric_limits::max()); + std::uniform_int_distribution rand_int64(std::numeric_limits::min(), + std::numeric_limits::max()); + auto r = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> int64_t { return rand_int64(re); }); + auto d = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> double { return rand_double(re); }); + + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); + auto most_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); + auto few_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 13 == 0 ? 
1 : 0; }); + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::INT16}); + } + + for (int i = 0; i < 15; ++i) { + if (i < 5) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + } else { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) + .release() + .release()); + } + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows, all_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows, most_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, all_valid, numeric::scale_type{-2}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, most_valid, numeric::scale_type{-1}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); + } + + std::vector views(cols.begin(), cols.end()); + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 5 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, SimpleString) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1, 0, -1}); + cudf::test::strings_column_wrapper b( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::table_view in(std::vector{a, b}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::STRING}}; + + auto new_rows = cudf::convert_to_rows(in); 
+ EXPECT_EQ(new_rows.size(), 1); + for (auto& row : new_rows) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*row), schema); + + EXPECT_EQ(row->size(), 5); + auto const num_columns = new_cols->num_columns(); + + cudf::strings_column_view str_col = new_cols->get_column(1).view(); + std::vector> col_data; + std::vector> offset_data; + for (int i = 0; i < num_columns; ++i) { + offset_data.emplace_back( + std::get<0>(cudf::test::to_host(str_col.offsets()))); + col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); + } +} + +TEST_F(RowToColumnTests, DoubleString) +{ + cudf::test::strings_column_wrapper a( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); + cudf::test::strings_column_wrapper c({"world", + "hello", + "this string isn't as long", + "this one isn't so short though when you think about it", + "dlrow"}); + cudf::table_view in(std::vector{a, b, c}); + std::vector schema = {cudf::data_type{cudf::type_id::STRING}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::STRING}}; + + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < new_rows.size(); ++i) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + EXPECT_EQ(new_rows[0]->size(), 5); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); + } +} + +TEST_F(RowToColumnTests, BigStrings) +{ + char const* TEST_STRINGS[] = { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "a", + "good test", + "is required to produce reasonable confidence that this is working"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 50; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + for (auto& i : new_rows) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*i), schema); + + auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); + } +} + +TEST_F(RowToColumnTests, ManyStrings) +{ + char const* TEST_STRINGS[] = { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "this string is the 
longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "a", + "good test", + "is required to produce reasonable confidence that this is working", + "some strings", + "are split into multiple strings", + "some strings have all their data", + "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " + "that even a really long string ends up in the correct spot for the final destination allowing " + "for even crazy run-on sentences to be inserted into the data"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 500000; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + 
cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + for (auto& i : new_rows) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*i), schema); + + auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 3bd1e3f25a7..50ea54ddaab 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -751,14 +751,6 @@ private static native long[] scatterScalars(long[] srcScalarHandles, long scatte long targetTableHandle) throws CudfException; - private static native long[] convertToRows(long nativeHandle); - - private static native long[] convertToRowsFixedWidthOptimized(long nativeHandle); - - private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); - - private static native long[] convertFromRowsFixedWidthOptimized(long nativeColumnView, int[] types, int[] scale); - private static native long[] repeatStaticCount(long tableHandle, int count); private static native long[] repeatColumnCount(long tableHandle, @@ -3647,140 +3639,6 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe return buildSemiJoinGatherMap(gatherMapData); } - /** - * For details about how this method functions refer to - * {@link #convertToRowsFixedWidthOptimized()}. - * - * The only thing different between this method and {@link #convertToRowsFixedWidthOptimized()} - * is that this can handle roughly 250M columns while {@link #convertToRowsFixedWidthOptimized()} - * can only handle columns less than 100 - */ - public ColumnVector[] convertToRows() { - long[] ptrs = convertToRows(nativeHandle); - return ColumnVector.getColumnVectorsFromPointers(ptrs); - } - - /** - * Convert this table of columns into a row major format that is useful for interacting with other - * systems that do row major processing of the data. Currently only fixed-width column types are - * supported. - *

- * The output is one or more ColumnVectors that are lists of bytes. A ColumnVector that is a - * list of bytes can have at most 2GB of data stored in it. Multiple ColumnVectors are returned - * if not all of the data can fit in a single one. - *

- * Each row in the returned ColumnVector array corresponds to a row in the input table. The rows - * will be in the same order as the input Table. The first ColumnVector in the array will hold - * the first N rows followed by the second ColumnVector and so on. The following illustrates - * this and also shows some of the internal structure that will be explained later. - *

-   * result[0]:
-   *  | row 0 | validity for row 0 | padding |
-   *  ...
-   *  | row N | validity for row N | padding |
-   *  result[1]:
-   *  |row N+1 | validity for row N+1 | padding |
-   *  ...
-   * 
- * - * The format of each row is similar in layout to a C struct where each column will have padding - * in front of it to align it properly. Each row has padding inserted at the end so the next row - * is aligned to a 64-bit boundary. This is so that the first column will always start at the - * beginning (first byte) of the list of bytes and each row has a consistent layout for fixed - * width types. - *

- * Validity bytes are added to the end of the row. There will be one byte for each 8 columns in a - * row. Because the validity is byte aligned there is no padding between it and the last column - * in the row. - *
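As a rough sketch of the rule above (the helper name below is illustrative and not part of this patch), the number of validity bytes appended to each row is simply the column count rounded up to a multiple of eight:

  #include <cstddef>

  // One validity byte covers eight columns and follows the last data column
  // directly, with no padding in between (illustrative helper, not in the patch).
  constexpr std::size_t validity_bytes_per_row(std::size_t num_columns)
  {
    return (num_columns + 7) / 8;
  }

  static_assert(validity_bytes_per_row(3) == 1);  // the A, B, C example below
  static_assert(validity_bytes_per_row(9) == 2);  // a ninth column starts a second byte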

- * For example a table consisting of the following columns A, B, C with the corresponding types - *

-   *   | A - BOOL8 (8-bit) | B - INT16 (16-bit) | C - DURATION_DAYS (32-bit) |
-   * 
- *

- * Will have a layout that looks like - *

-   *  | A_0 | P | B_0 | B_1 | C_0 | C_1 | C_2 | C_3 | V0 | P | P | P | P | P | P | P |
-   * 
- *

- * In this P corresponds to a byte of padding, [LETTER]_[NUMBER] represents the NUMBER - * byte of the corresponding LETTER column, and V[NUMBER] is a validity byte for the `NUMBER * 8` - * to `(NUMBER + 1) * 8` columns. - *
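To make the arithmetic behind that layout concrete, here is a minimal sketch; the align_up helper is illustrative only, and it assumes each fixed-width column is padded to its own width before the validity byte and the 64-bit row padding described above are appended:

  #include <cstddef>

  // Round `offset` up to the next multiple of `alignment` (a power of two);
  // illustrative only, not a function from this patch.
  constexpr std::size_t align_up(std::size_t offset, std::size_t alignment)
  {
    return (offset + alignment - 1) & ~(alignment - 1);
  }

  static_assert(align_up(1, 2) == 2);       // one pad byte after A_0 so B_0 starts at offset 2
  static_assert(align_up(4, 4) == 4);       // C_0..C_3 occupy offsets 4..7
  static_assert(align_up(8 + 1, 8) == 16);  // V0 at offset 8, then pad to the 64-bit row boundary

So the A, B, C ordering costs 16 bytes per row, which is the figure the reordering advice below improves on.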

- * The order of the columns will not be changed, but to reduce the total amount of padding it is - * recommended to order the columns in the following way. - *

- *

    - *
-   *   1. 64-bit columns
-   *   2. 32-bit columns
-   *   3. 16-bit columns
-   *   4. 8-bit columns
- *

- * This way padding is only inserted at the end of a row to make the next column 64-bit aligned. - * So for the example above if the columns were ordered C, B, A the layout would be. - *

-   * | C_0 | C_1 | C_2 | C_3 | B_0 | B_1 | A_0 | V0 |
-   * 
- * This would have reduced the overall size of the data transferred by half. - *
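The halving claim checks out under the same illustrative arithmetic (align_up as sketched earlier, not a function from this patch): with the C, B, A ordering every column already sits on a naturally aligned offset, so only the validity byte follows the data and the row ends exactly on the 64-bit boundary:

  #include <cstddef>

  // Illustrative helper, not part of the patch.
  constexpr std::size_t align_up(std::size_t offset, std::size_t alignment)
  {
    return (offset + alignment - 1) & ~(alignment - 1);
  }

  // C at offsets 0..3, B at 4..5, A at 6, validity byte V0 at 7.
  static_assert(align_up(4 + 2 + 1 + 1, 8) == 8);  // 8 bytes per row, versus 16 for A, B, C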

- * One of the main motivations for doing a row conversion on the GPU is to avoid cache problems - * when walking through columnar data on the CPU in a row wise manner. If you are not transferring - * very many columns it is likely to be more efficient to just pull back the columns and walk - * through them. This is especially true of a single column of fixed width data. The extra - * padding will slow down the transfer and looking at only a handful of buffers is not likely to - * cause cache issues. - *

- * There are some limits on the size of a single row. If the row is larger than 1KB this will - * throw an exception. - */ - public ColumnVector[] convertToRowsFixedWidthOptimized() { - long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); - return ColumnVector.getColumnVectorsFromPointers(ptrs); - } - - /** - * Convert a column of list of bytes that is formatted like the output from `convertToRows` - * and convert it back to a table. - * - * NOTE: This method doesn't support nested types - * - * @param vec the row data to process. - * @param schema the types of each column. - * @return the parsed table. - */ - public static Table convertFromRows(ColumnView vec, DType ... schema) { - int[] types = new int[schema.length]; - int[] scale = new int[schema.length]; - for (int i = 0; i < schema.length; i++) { - types[i] = schema[i].typeId.nativeId; - scale[i] = schema[i].getScale(); - - } - return new Table(convertFromRows(vec.getNativeView(), types, scale)); - } - - /** - * Convert a column of list of bytes that is formatted like the output from `convertToRows` - * and convert it back to a table. - * - * NOTE: This method doesn't support nested types - * - * @param vec the row data to process. - * @param schema the types of each column. - * @return the parsed table. - */ - public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { - int[] types = new int[schema.length]; - int[] scale = new int[schema.length]; - for (int i = 0; i < schema.length; i++) { - types[i] = schema[i].typeId.nativeId; - scale[i] = schema[i].getScale(); - - } - return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale)); - } - /** * Construct a table from a packed representation. * @param metadata host-based metadata for the table diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 260b402443a..1853cbd6a5c 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -147,7 +147,6 @@ add_library( src/TableJni.cpp src/aggregation128_utils.cu src/maps_column_view.cu - src/row_conversion.cu src/check_nvcomp_output_sizes.cu ) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index fad19bdf895..d7d0279174d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -54,7 +54,6 @@ #include "jni_compiled_expr.hpp" #include "jni_utils.hpp" #include "jni_writer_data_sink.hpp" -#include "row_conversion.hpp" namespace cudf { namespace jni { @@ -3207,24 +3206,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL -Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, jlong input_table) { - JNI_NULL_CHECK(env, input_table, "input table is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const n_input_table = reinterpret_cast(input_table); - std::vector> cols = - cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table); - int num_columns = cols.size(); - cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return release_as_jlong(col); }); - return outcol_handles.get_jArray(); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, jclass, jlong j_input, jlong j_map, jlong j_target) { @@ -3260,72 +3241,6 @@ JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *en CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, - jlong input_table) { - JNI_NULL_CHECK(env, input_table, "input table is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::jni::convert_to_rows(*n_input_table); - int num_columns = cols.size(); - cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return release_as_jlong(col); }); - return outcol_handles.get_jArray(); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized( - JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { - JNI_NULL_CHECK(env, input_column, "input column is null", 0); - JNI_NULL_CHECK(env, types, "types is null", 0); - - try { - cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; - cudf::jni::native_jintArray n_types(env, types); - cudf::jni::native_jintArray n_scale(env, scale); - if (n_types.size() != n_scale.size()) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", - NULL); - } - std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), - [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); - std::unique_ptr result = - cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); - return convert_table_for_return(env, result); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass, - jlong input_column, - jintArray types, - jintArray scale) { - JNI_NULL_CHECK(env, input_column, "input column is null", 0); - JNI_NULL_CHECK(env, types, "types is null", 0); - - try { - cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; - cudf::jni::native_jintArray n_types(env, types); - cudf::jni::native_jintArray n_scale(env, scale); - if (n_types.size() != n_scale.size()) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", - NULL); - } - std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), - [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); - std::unique_ptr result = cudf::jni::convert_from_rows(list_input, types_vec); - return convert_table_for_return(env, result); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatStaticCount(JNIEnv *env, jclass, jlong input_jtable, jint count) { diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp deleted file mode 100644 index e4631875152..00000000000 --- a/java/src/main/native/src/row_conversion.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -namespace cudf { -namespace jni { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::vector> -convert_to_rows(cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr -convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -} // namespace jni -} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b0dd4122b0e..8df8ebea8a7 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -9076,73 +9076,6 @@ void testStructColumnFilterStrings() { } } - @Test - void fixedWidthRowsRoundTripWide() { - TestBuilder tb = new TestBuilder(); - IntStream.range(0, 10).forEach(i -> tb.column(3l, 9l, 4l, 2l, 20l, null)); - IntStream.range(0, 10).forEach(i -> tb.column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); - IntStream.range(0, 10).forEach(i -> tb.column(5, 1, 0, 2, 7, null)); - IntStream.range(0, 10).forEach(i -> tb.column(true, false, false, true, false, null)); - IntStream.range(0, 10).forEach(i -> tb.column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null)); - IntStream.range(0, 10).forEach(i -> tb.column(new Byte[]{2, 3, 4, 5, 9, null})); - IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, - 9.5d, 0.9d, 7.23d, 2.8d, null)); - IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); - try (Table origTable = tb.build()) { - ColumnVector[] rowMajorTable = origTable.convertToRows(); - try { - // We didn't overflow - assert rowMajorTable.length == 1; - ColumnVector cv = rowMajorTable[0]; - assert cv.getRowCount() == origTable.getRowCount(); - DType[] types = new DType[origTable.getNumberOfColumns()]; - for (int i = 0; i < origTable.getNumberOfColumns(); i++) { - types[i] = origTable.getColumn(i).getType(); - } - try (Table backAgain = Table.convertFromRows(cv, types)) { - assertTablesAreEqual(origTable, backAgain); - } - } finally { - for (ColumnVector cv : rowMajorTable) { - cv.close(); - } - } - } - } - - @Test - void fixedWidthRowsRoundTrip() { - try (Table origTable = new TestBuilder() - .column(3l, 9l, 4l, 2l, 20l, null) - .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) - .column(5, 1, 0, 2, 7, 
null) - .column(true, false, false, true, false, null) - .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null) - .column(new Byte[]{2, 3, 4, 5, 9, null}) - .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) - .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) - .build()) { - ColumnVector[] rowMajorTable = origTable.convertToRowsFixedWidthOptimized(); - try { - // We didn't overflow - assert rowMajorTable.length == 1; - ColumnVector cv = rowMajorTable[0]; - assert cv.getRowCount() == origTable.getRowCount(); - DType[] types = new DType[origTable.getNumberOfColumns()]; - for (int i = 0; i < origTable.getNumberOfColumns(); i++) { - types[i] = origTable.getColumn(i).getType(); - } - try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { - assertTablesAreEqual(origTable, backAgain); - } - } finally { - for (ColumnVector cv : rowMajorTable) { - cv.close(); - } - } - } - } - // utility methods to reduce typing private static StructData struct(Object... values) {